From 93bff73972f629367f41a8b24216410092233c8f Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Wed, 27 Nov 2024 03:43:49 +0000 Subject: [PATCH 01/10] [NVPTX] Support for memory orderings for cmpxchg --- llvm/include/llvm/CodeGen/TargetLowering.h | 8 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 6 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 65 + llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 11 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 139 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +- llvm/test/CodeGen/NVPTX/atomics.ll | 6 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 5680 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 5680 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 5680 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/cmpxchg.ll | 1629 +++++- llvm/test/CodeGen/NVPTX/cmpxchg.py | 46 + llvm/test/CodeGen/NVPTX/lit.local.cfg | 2 +- 13 files changed, 18815 insertions(+), 139 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg.py diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index bbecc7a6ddaee..ac3233ec30ee3 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2173,6 +2173,14 @@ class TargetLoweringBase { return false; } + // The memory ordering that AtomicExpandPass should assign to an atomic + // instruction that it has lowered by adding fences. This can be used + // to "fold" one of the fences into the atomic instruction. + virtual AtomicOrdering + atomicOperationOrderAfterFenceSplit(const Instruction *I) const { + return AtomicOrdering::Monotonic; + } + /// Whether AtomicExpandPass should automatically insert a trailing fence /// without reducing the ordering for this atomic. Defaults to false.
virtual bool diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index a75fa688d87a8..a3e9700fa3089 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // failure path. As a result, fence insertion is directly done by // expandAtomicCmpXchg in that case. FenceOrdering = CASI->getMergedOrdering(); - CASI->setSuccessOrdering(AtomicOrdering::Monotonic); - CASI->setFailureOrdering(AtomicOrdering::Monotonic); + auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); + + CASI->setSuccessOrdering(CASOrdering); + CASI->setFailureOrdering(CASOrdering); } if (FenceOrdering != AtomicOrdering::Monotonic) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 58ad92a8934a6..a74eada0afceb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/FPEnv.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -49,6 +50,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); + // PTX support for 16-bit CAS is emulated. 
Only use 32+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); @@ -5565,6 +5568,68 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { return AtomicExpansionKind::CmpXChg; } +bool NVPTXTargetLowering::shouldInsertFencesForAtomic( + const Instruction *I) const { + auto *CI = dyn_cast<AtomicCmpXchgInst>(I); + // When CAS bitwidth is not supported on the hardware, the CAS is emulated + // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce + // the memory order using explicit fences around the retry loop. + // The memory order of natively supported CAS operations can be enforced + // by lowering to an atom.cas with the right memory synchronizing effect. + // However, atom.cas only supports relaxed, acquire, release and acq_rel. + // So we also use explicit fences for enforcing memory order for + // seq_cst CAS with natively-supported bitwidths. + return CI && + (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() < + STI.getMinCmpXchgSizeInBits() || + CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent); +} + +AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( + const Instruction *I) const { + auto *CI = dyn_cast<AtomicCmpXchgInst>(I); + bool BitwidthSupportedAndIsSeqCst = + CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent && + cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >= + STI.getMinCmpXchgSizeInBits(); + return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire + : AtomicOrdering::Monotonic; +} + +Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + // Specialize for cmpxchg + if (isa<AtomicCmpXchgInst>(Inst)) { + // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + if (isReleaseOrStronger(Ord)) + return Ord == AtomicOrdering::SequentiallyConsistent ?
+ Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) : + Builder.CreateFence(AtomicOrdering::Release); + } else { + return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); + } + return nullptr; +} + +Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + // Specialize for cmpxchg + if (isa<AtomicCmpXchgInst>(Inst)) { + auto CASWidth = cast<IntegerType>(cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())->getBitWidth(); + // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated + if (isAcquireOrStronger(Ord)) + return (Ord == AtomicOrdering::SequentiallyConsistent + && CASWidth >= STI.getMinCmpXchgSizeInBits()) ? + nullptr : + Builder.CreateFence(AtomicOrdering::Acquire); + } else { + return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + } + return nullptr; +} + // Pin NVPTXTargetObjectFile's vtables to this file. NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 5adf69d621552..1ee7a9b9ab8e3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -17,6 +17,7 @@ #include "NVPTX.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Support/AtomicOrdering.h" namespace llvm { namespace NVPTXISD { @@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering { return true; } + bool shouldInsertFencesForAtomic(const Instruction *) const override; + + AtomicOrdering + atomicOperationOrderAfterFenceSplit(const Instruction *I) const override; + + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + private: const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG,
int idx, EVT) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 06c629c01d9ab..14744411bc74b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG Pred> { let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, regclass:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>, Requires; def imm1 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, regclass:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>, Requires; def imm2 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, IMMType:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>, Requires; def imm3 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, IMMType:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>, Requires; } } -multiclass F_ATOMIC_3 Pred = []> { - defm p32 : F_ATOMIC_3_imp; - defm p64 : F_ATOMIC_3_imp; +multiclass F_ATOMIC_3 Pred = []> { + defm p32 : F_ATOMIC_3_imp; + defm p64 : F_ATOMIC_3_imp; } // atom_add @@ 
-2378,51 +2378,86 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2]>; -// atom_cas - -def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; - -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.generic>; + + def NAME#_g: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + (!cast(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.global>; + + def NAME#_s: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + (!cast(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.shared>; +} + +// generate pattern fragments for size x memory order +// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs +// using larger-bitwidth cas +foreach size = ["i16", "i32", "i64"] in { + foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in { + defm atomic_cmp_swap#_#size#order: 
ternary_atomic_op_as; + } +} + +multiclass INT_PTX_ATOM_CAS_AS { + // eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes + // F_ATOMIC_3 + + + // For SM70, PTX63 +, memory orders are supported + defm _GEN: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + + defm _GEN_USE_G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + + defm _G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + + defm _S: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + + // For older archs, we fall back to lowering to relaxed atom.cas + defm _GEN_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", "", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm")>; + + defm _GEN_USE_G_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm")>; + + defm _G_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm")>; + + defm _S_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm")>; +} + +// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware. 
+foreach size = ["32", "64"] in { + // We enumerate the instructions with specific memory orders ["acquire", "release", "acq_rel"] + // *before* the instructions with the unspecified memory order [""]. + // This ensures that `cmpxchg ... acquire` is lowered to "atom.acquire.cas" instead of "atom.cas" + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + foreach order = ["acquire", "release", "acq_rel", ""] in { + if !eq(order, "") then { + defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">; + } else { + defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>; + } + } +} + +// Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?) +defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 851c9152e4cb8..f893f1aefec84 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // set of equivalent memory operations with a scalar data-type, executed in // an unspecified order on the elements in the vector. 
unsigned getMaxRequiredAlignment() const { return 8; } - // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS + // Get the smallest cmpxchg word size that the hardware supports. unsigned getMinCmpXchgSizeInBits() const { return 32; } unsigned getPTXVersion() const { return PTXVersion; } diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index fccc3a8844ffc..ac2bd8828b205 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) { ; CHECK-LABEL: atomicrmw_add_f16_generic define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { -; CHECK: atom.cas +; CHECK: atom.acquire.cas %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret } @@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { ; CHECK-LABEL: atomic_cmpxchg_i32 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { -; CHECK: atom.cas.b32 +; CHECK: atom.acquire.cas.b32 %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new } ; CHECK-LABEL: atomic_cmpxchg_i64 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { -; CHECK: atom.cas.b64 +; CHECK: atom.acquire.cas.b64 %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll new file mode 100644 index 0000000000000..ea308c2a7673b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; 
SM60-LABEL: monotonic_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB0_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB0_1; +; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; 
SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB1_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB1_1; +; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: 
cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB2_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB2_1; +; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB3_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB3_1; +; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB4_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB4_1; +; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB5_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB5_1; +; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: 
@%p1 bra $L__BB6_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB6_1; +; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB7_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB7_1; +; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB8_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB8_1; +; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB9_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB9_1; +; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB10_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB10_1; +; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 
%r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB11_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB11_1; +; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB12_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB12_1; +; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; 
SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB13_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB13_1; +; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB14_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB14_1; +; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: 
$L__BB15_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB15_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB15_1; +; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: 
atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB16_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB16_1; +; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB17_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB17_1; +; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB18_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, 
%r8; +; SM60-NEXT: @%p2 bra $L__BB18_1; +; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB19_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB19_1; +; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB20_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB20_1; +; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic( +; SM60: { 
+; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB21_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB21_1; +; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 
%rs1, [release_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB22_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB22_1; +; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, 
-4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB23_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB23_1; +; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, 
%r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB24_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB24_1; +; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, 
[release_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB25_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB25_1; +; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: 
$L__BB26_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB26_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB26_1; +; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; 
SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB27_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB27_1; +; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB28_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB28_1; +; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB29_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; 
SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB29_1; +; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB30_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB30_1; +; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; 
+; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB31_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB31_1; +; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB32_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB32_1; +; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: 
.reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB33_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB33_1; +; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; 
+; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB34_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB34_1; +; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; 
+; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB35_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB35_1; +; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, 
%r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB36_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB36_1; +; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 
%r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB37_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB37_1; +; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB38_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB38_1; +; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 
bra $L__BB39_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB39_1; +; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB40_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, 
%r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB40_1; +; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB41_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB41_1; +; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB42_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB42_1; +; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB43_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB43_1; +; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 
%r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB44_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB44_1; +; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: 
ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB45_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB45_1; +; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; 
+; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB46_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB46_1; +; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB47_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB47_1; +; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, 
%r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB48_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB48_1; +; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB49_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB49_1; +; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB50_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB50_1; +; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB51_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB51_1; +; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB52_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB52_1; +; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared( +; SM60: 
{ +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB53_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB53_1; +; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 
%rs1, [acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB54_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB54_1; +; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB55_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB55_1; +; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, 
%r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB56_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB56_1; +; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB57_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB57_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB57_1; +; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, 
%r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB58_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB58_1; +; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB59_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB59_1; +; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB60_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB60_1; +; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; 
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB61_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB61_1; +; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acquire_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB62_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB62_1; +; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // 
%bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB63_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB63_1; +; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[release_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB64_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB64_1; +; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 
%r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB65_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB65_1; +; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, 
[%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB66_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB66_1; +; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB67_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB67_1; +; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB68_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB68_1; +; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB69_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB69_1; +; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB70_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB70_1; +; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB71_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB71_1; +; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic( +; SM60: { +; 
SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB72_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB72_1; +; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, 
[acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB73_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB73_1; +; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB74_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB74_1; +; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: 
mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB75_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB75_1; +; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: 
ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB76_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB76_1; +; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, 
%r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB77_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB77_1; +; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB78_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB78_1; +; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB79_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB79_1; +; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB80_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB80_1; +; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB81_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB81_1; +; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global( +; SM60: { +; SM60-NEXT: 
.reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB82_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB82_1; +; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 
%rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB83_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB83_1; +; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB84_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB84_1; +; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; 
+; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB85_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB85_1; +; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB86_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB86_1; +; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, 
%r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB87_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB87_1; +; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB88_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB88_1; +; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB89_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; 
SM60-NEXT: @%p2 bra $L__BB89_1; +; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: 
ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; 
SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acquire_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { 
+; SM60-LABEL: acquire_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 
@acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: 
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg 
.b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) 
{ +; SM60-LABEL: acquire_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global( +; 
SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[release_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: 
ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll new file mode 100644 index 0000000000000..4360ea36e863a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB0_1; +; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, 
%r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB1_1; +; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB2_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB2_1; +; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, 
[%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB3_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB3_1; +; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; 
SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB4_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB4_1; +; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra 
$L__BB5_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB5_1; +; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB6_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB6_1; +; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB7_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB7_1; +; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB8_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB8_1; +; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB9_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB9_1; +; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; 
SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB10_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB10_1; +; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[acquire_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB11_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB11_1; +; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB12_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB12_1; +; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB13_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB13_1; +; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, 
%r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB14_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB14_1; +; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB15_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB15_1; +; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB16_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB16_1; +; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB17_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: 
Header=BB17_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB17_1; +; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB18_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB18_1; +; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB19_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB19_1; +; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB20_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB20_1; +; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic( +; SM70: { +; 
SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB21_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB21_1; +; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB22_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB22_1; +; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB23_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB23_1; +; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 
3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB24_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB24_1; +; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 
255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB25_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB25_1; +; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB26_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB26_1; +; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB27_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB27_1; +; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], 
%r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB28_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB28_1; +; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB29_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB29_1; +; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB30_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: 
mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB30_1; +; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB31_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB31_1; +; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB32_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB32_1; +; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + 
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB33_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB33_1; +; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB34_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB34_1; +; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB35_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB35_1; +; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 
%rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB36_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB36_1; +; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; 
SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB37_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB37_1; +; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 
255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB38_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB38_1; +; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB39_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB39_1; +; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB40_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB40_1; +; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB41_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB41_1; +; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB42_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB42_1; +; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB43_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB43_1; +; SM70-NEXT: 
$L__BB43_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB44_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB44_1; +; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB45_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB45_1; +; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg 
.pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB46_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB46_1; +; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB47_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB47_1; +; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB48_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB48_1; +; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB49_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB49_1; +; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; 
SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB50_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB50_1; +; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; 
SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB51_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB51_1; +; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB52_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB52_1; +; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB53_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB53_1; +; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB54_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB54_1; +; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB55_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB55_1; +; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
acquire_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB56_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB56_1; +; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB57_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB57_1; +; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, 
[acquire_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB58_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB58_1; +; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 
%r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB59_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB59_1; +; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 
%r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB60_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB60_1; +; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; 
SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB61_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB61_1; +; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB62_3; 
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB62_1; +; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB63_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB63_1; +; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB64_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB64_1; +; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB65_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB65_1; +; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic( 
+; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB66_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB66_1; +; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB67_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB67_1; +; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB68_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB68_1; +; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: 
shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB69_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB69_1; +; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB70_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB70_1; +; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB71_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB71_1; +; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB72_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB72_1; +; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB73_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB73_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB73_1; +; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB74_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB74_1; +; 
SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB75_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB75_1; +; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB76_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB76_1; +; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared( +; 
SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB77_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB77_1; +; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB78_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB78_1; +; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, 
[acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB79_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB79_1; +; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, 
%r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB80_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB80_1; +; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 
%r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB81_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB81_1; +; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB82_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB82_1; +; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB83_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB83_1; +; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB84_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB84_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB84_1; +; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB85_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB85_1; +; SM70-NEXT: $L__BB85_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB86_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB86_1; +; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB87_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB87_1; +; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred 
%p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB88_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB88_1; +; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB89_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB89_1; +; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; 
SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic( +; SM70: { +; 
SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
acquire_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire 
seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold 
= cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; 
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; 
SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
seq_cst_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire 
+ ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.relaxed.global.cas.b64 
%rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM70-NEXT: 
atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acquire_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 
%rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; 
+; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll new file mode 100644 index 0000000000000..5acb275a6f581 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB0_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB0_1; +; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, 
%r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB1_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB1_1; +; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB2_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB2_1; +; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, 
[%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB3_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB3_1; +; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; 
SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB4_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB4_1; +; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra 
$L__BB5_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB5_1; +; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB6_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB6_1; +; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB7_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB7_1; +; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB8_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB8_1; +; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB9_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB9_1; +; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; 
SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB10_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB10_1; +; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB11_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB11_1; +; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB12_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB12_1; +; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB13_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB13_1; +; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB14_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB14_1; +; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB15_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB15_1; +; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB16_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB16_1; +; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB17_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB17_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB17_1; +; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB18_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra 
$L__BB18_1; +; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB19_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB19_1; +; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB20_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB20_1; +; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic( +; SM90: { +; 
SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB21_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB21_1; +; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB22_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB22_1; +; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM90-NEXT: 
fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB23_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB23_1; +; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 
3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB24_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB24_1; +; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 
255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB25_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB25_1; +; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB26_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB26_1; +; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB27_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB27_1; +; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], 
%r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB28_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB28_1; +; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB29_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB29_1; +; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB30_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: 
mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB30_1; +; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB31_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB31_1; +; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB32_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB32_1; +; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + 
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB33_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB33_1; +; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 
%rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB34_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB34_1; +; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB35_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB35_1; +; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 
%rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB36_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB36_1; +; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; 
SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB37_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB37_1; +; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 
255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB38_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB38_1; +; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB39_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB39_1; +; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB40_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB40_1; +; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB41_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB41_1; +; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB42_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1 +; 
SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB42_1; +; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB43_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB43_1; +; SM90-NEXT: 
$L__BB43_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB44_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB44_1; +; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB45_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB45_1; +; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg 
.pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB46_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB46_1; +; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB47_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB47_1; +; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB48_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB48_1; +; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB49_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB49_1; +; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; 
SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB50_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB50_1; +; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; 
SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB51_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB51_1; +; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB52_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB52_1; +; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB53_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: 
mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB53_1; +; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB54_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB54_1; +; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB55_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB55_1; +; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
acquire_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB56_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB56_1; +; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB57_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB57_1; +; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, 
[acquire_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB58_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB58_1; +; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 
%r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB59_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB59_1; +; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 
%r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB60_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB60_1; +; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; 
SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB61_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB61_1; +; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB62_3; 
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB62_1; +; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB63_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB63_1; +; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB64_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB64_1; +; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB65_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB65_1; +; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic( 
+; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB66_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB66_1; +; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB67_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB67_1; +; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: 
ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB68_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB68_1; +; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: 
shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB69_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB69_1; +; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB70_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB70_1; +; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB71_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB71_1; +; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB72_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB72_1; +; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB73_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB73_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB73_1; +; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB74_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB74_1; +; 
SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB75_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB75_1; +; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB76_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB76_1; +; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared( +; 
SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB77_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB77_1; +; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB78_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB78_1; +; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB79_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB79_1; +; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, 
%r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB80_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB80_1; +; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 
%r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB81_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB81_1; +; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB82_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB82_1; +; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB83_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB83_1; +; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB84_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB84_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB84_1; +; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB85_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB85_1; +; SM90-NEXT: $L__BB85_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB86_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB86_1; +; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB87_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB87_1; +; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred 
%p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB88_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB88_1; +; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB89_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB89_1; +; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic( +; SM90: { +; 
SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
acquire_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire 
seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold 
= cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; 
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; 
SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
seq_cst_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire 
+ ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.relaxed.global.cas.b64 
%rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM90-NEXT: 
atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 
%rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; 
+; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 33a1f15c6a5cd..aaea0d2ee25ef 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -7,6 +7,7 @@ ; TODO: these are system scope, but are compiled to gpu scope.. ; TODO: these are seq_cst, but are compiled to relaxed.. 
+ ; CHECK-LABEL: relaxed_sys_i8 define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-LABEL: relaxed_sys_i8( @@ -17,86 +18,1153 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB0_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB0_1; +; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB0_1; +; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, 
[relaxed_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB0_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB0_1; +; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: acquire_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; 
SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB1_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB1_1; +; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB1_1; +; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; 
SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB1_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB1_1; +; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: release_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; 
SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB2_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB2_1; +; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB2_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB2_1; +; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: 
or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB2_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB2_1; +; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: acq_rel_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB3_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; 
SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB3_1; +; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB3_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB3_1; +; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: acq_rel_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; 
+; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB3_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB3_1; +; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: seq_cst_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB4_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB4_1; +; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: shl.b32 
%r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB4_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB4_1; +; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB4_3; 
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB4_1; +; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +; CHECK-LABEL: relaxed_sys_i16 +define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: relaxed_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB5_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB5_1; +; SM30-NEXT: $L__BB5_3: // 
%partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB5_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB5_1; +; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 
%r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB5_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB5_1; +; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: acquire_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: 
shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB6_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB6_1; +; SM30-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB6_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB6_1; +; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB6_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB6_1; +; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @release_sys_i16(ptr 
%addr, i16 %cmp, i16 %new) { +; SM30-LABEL: release_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB7_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB7_1; +; SM30-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB7_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB7_1; +; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB7_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB7_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB7_1; +; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: acq_rel_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; 
-; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM30-NEXT: @%p1 bra $L__BB0_3; +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB8_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM30-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM30-NEXT: mov.u32 %r20, %r8; -; SM30-NEXT: @%p2 bra $L__BB0_1; -; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB8_1; +; SM30-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; -; SM70-LABEL: relaxed_sys_i8( +; SM70-LABEL: acq_rel_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: 
and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB0_1; -; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB8_1; +; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 
[func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic - ret i8 %new +; SM90-LABEL: acq_rel_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB8_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB8_1; +; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new } -; CHECK-LABEL: relaxed_sys_i16 -define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { -; SM30-LABEL: relaxed_sys_i16( +; CHECK-LABEL: seq_cst_sys_i16 +define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; 
SM30-LABEL: seq_cst_sys_i16( ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; @@ -104,10 +1172,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -119,24 +1188,25 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; -; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 ; SM30-NEXT: or.b32 %r16, %r19, %r3; ; SM30-NEXT: or.b32 %r17, %r19, %r4; ; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM30-NEXT: @%p1 bra $L__BB1_3; +; SM30-NEXT: @%p1 bra $L__BB9_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM30-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM30-NEXT: and.b32 %r8, %r7, %r2; ; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM30-NEXT: mov.u32 %r19, %r8; -; SM30-NEXT: @%p2 bra $L__BB1_1; -; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM30-NEXT: @%p2 bra $L__BB9_1; +; SM30-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; ; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; -; SM70-LABEL: relaxed_sys_i16( +; SM70-LABEL: seq_cst_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg 
.b16 %rs<2>; @@ -144,10 +1214,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -159,23 +1230,65 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.u32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB1_1; -; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB9_1; +; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic +; SM90-LABEL: seq_cst_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred 
%p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB9_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB9_1; +; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -203,13 +1316,197 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic ret i32 %new } +define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: acq_rel_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: acq_rel_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel 
acquire + ret i32 %new +} + +define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: acquire_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: release_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: seq_cst_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + ; CHECK-LABEL: relaxed_sys_i64 define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-LABEL: relaxed_sys_i64( @@ -232,11 +1529,183 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic ret i64 %new } + +define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: acquire_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, 
[acquire_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: acq_rel_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; 
SM90-LABEL: acq_rel_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: release_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: 
seq_cst_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py new file mode 100644 index 0000000000000..bf787406b76c0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -0,0 +1,46 @@ +# For manual usage, not as a part of lit tests. 
Used for generating the following tests: +# cmpxchg-sm60.ll, cmpxchg-sm70.ll, cmpxchg-sm90.ll + +from string import Template +from itertools import product + +cmpxchg_func = Template( +"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure + ret i$size %new +} +""" +) + +run_statement = Template( + """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %} +""" +) + +TESTS = [(60, 50), (70, 63), (90, 87)] + +LLVM_SCOPES = ["", "block", "cluster", "device"] + +SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"} + +SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"] + +FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"] + +SIZES = [8, 16, 32, 64] + +ADDRSPACES = [0, 1, 3] + +ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"} + +if __name__ == "__main__": + for sm, ptx in TESTS: + with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: + print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) + for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES): + if addrspace == 0: + addrspace_cast = "" + else: + addrspace_cast = " addrspace({})".format(str(addrspace)) + print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp) diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg index 54a6c338bdf85..84cce669ec10b 100644 --- a/llvm/test/CodeGen/NVPTX/lit.local.cfg +++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg @@ -1,4 +1,4 @@ if not "NVPTX" in config.root.targets: config.unsupported = True 
config.suffixes.add(".py") -config.excludes = ["fence.py"] +config.excludes = ["fence.py", "cmpxchg.py"] From 87d17e5da7c516ae43d9396fabad0c266d3b1903 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 7 Feb 2025 00:02:54 +0000 Subject: [PATCH 02/10] clang-format, black --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 19 +++++++++++-------- llvm/test/CodeGen/NVPTX/cmpxchg.py | 17 ++++++++++++++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a74eada0afceb..db5f45fb7d09c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -5603,9 +5603,9 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, if (isa(Inst)) { // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent ? - Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) : - Builder.CreateFence(AtomicOrdering::Release); + return Ord == AtomicOrdering::SequentiallyConsistent + ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) + : Builder.CreateFence(AtomicOrdering::Release); } else { return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); } @@ -5617,13 +5617,16 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, AtomicOrdering Ord) const { // Specialize for cmpxchg if (isa(Inst)) { - auto CASWidth= cast(dyn_cast(Inst)->getCompareOperand()->getType())->getBitWidth(); + auto CASWidth = + cast( + dyn_cast(Inst)->getCompareOperand()->getType()) + ->getBitWidth(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord)) - return (Ord == AtomicOrdering::SequentiallyConsistent - && CASWidth >= STI.getMinCmpXchgSizeInBits()) ? 
- nullptr : - Builder.CreateFence(AtomicOrdering::Acquire); + return (Ord == AtomicOrdering::SequentiallyConsistent && + CASWidth >= STI.getMinCmpXchgSizeInBits()) + ? nullptr + : Builder.CreateFence(AtomicOrdering::Acquire); } else { return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index bf787406b76c0..ae7450015ecd2 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -5,7 +5,7 @@ from itertools import product cmpxchg_func = Template( -"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure ret i$size %new } @@ -38,9 +38,20 @@ for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES): + for size, success, failure, addrspace in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES + ): if addrspace == 0: addrspace_cast = "" else: addrspace_cast = " addrspace({})".format(str(addrspace)) - print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp) + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=addrspace_cast, + ), + file=fp, + ) From e6949e95fca01fce6726778e32aa49b8afcd8629 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 7 Feb 2025 01:37:58 +0000 Subject: [PATCH 03/10] fix tests: SM < 70 does not support memory order, emit explicit 
atom.cas.relaxed for monotonic --- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 ++++---- llvm/test/CodeGen/NVPTX/atomics.ll | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 67abfe8295a62..9027bd6a14780 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.u32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.u32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.u32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 
%r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.u32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index ac2bd8828b205..fccc3a8844ffc 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) { ; CHECK-LABEL: atomicrmw_add_f16_generic define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { -; CHECK: atom.acquire.cas +; CHECK: atom.cas %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret } @@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { ; CHECK-LABEL: atomic_cmpxchg_i32 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { -; CHECK: atom.acquire.cas.b32 +; CHECK: atom.cas.b32 %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new } ; CHECK-LABEL: atomic_cmpxchg_i64 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { -; CHECK: atom.acquire.cas.b64 +; CHECK: atom.cas.b64 %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } From c2735020e08ee51ce5fcf1dce12c4f3760370127 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 7 Feb 2025 01:40:49 +0000 Subject: [PATCH 04/10] expect a fence for seq_cst ordering --- llvm/test/CodeGen/NVPTX/atomics.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index fccc3a8844ffc..b7b459c44bef5 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -177,6 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, 
float %val) { ; CHECK-LABEL: atomicrmw_add_f16_generic define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { +; CHECK: membar.sys ; CHECK: atom.cas %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret @@ -198,6 +199,7 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { ; CHECK-LABEL: atomic_cmpxchg_i32 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { +; CHECK: membar.sys ; CHECK: atom.cas.b32 %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new @@ -205,6 +207,7 @@ define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { ; CHECK-LABEL: atomic_cmpxchg_i64 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { +; CHECK: membar.sys ; CHECK: atom.cas.b64 %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new From 9a36c96cb5271c9724bd502b911ef3c77b694c0e Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Sat, 8 Feb 2025 00:55:40 +0000 Subject: [PATCH 05/10] address review comments --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 43 ++- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 29 +- llvm/test/CodeGen/NVPTX/atomics.ll | 362 ++++++++++++++++++-- 3 files changed, 360 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index db5f45fb7d09c..f2b4bc3a4b561 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -5599,16 +5599,16 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const { - // Specialize for cmpxchg - if (isa(Inst)) { - // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated - if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? 
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); - } else { + if (!isa(Inst)) return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); - } + + // Specialize for cmpxchg + // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + if (isReleaseOrStronger(Ord)) + return Ord == AtomicOrdering::SequentiallyConsistent + ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) + : Builder.CreateFence(AtomicOrdering::Release); + return nullptr; } @@ -5616,20 +5616,19 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const { // Specialize for cmpxchg - if (isa(Inst)) { - auto CASWidth = - cast( - dyn_cast(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); - // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated - if (isAcquireOrStronger(Ord)) - return (Ord == AtomicOrdering::SequentiallyConsistent && - CASWidth >= STI.getMinCmpXchgSizeInBits()) - ? nullptr - : Builder.CreateFence(AtomicOrdering::Acquire); - } else { + if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); - } + + auto CASWidth = + cast( + dyn_cast(Inst)->getCompareOperand()->getType()) + ->getBitWidth(); + // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated + if (isAcquireOrStronger(Ord) && + (Ord != AtomicOrdering::SequentiallyConsistent || + CASWidth < STI.getMinCmpXchgSizeInBits())) + return Builder.CreateFence(AtomicOrdering::Acquire); + return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 14744411bc74b..1e88753e356e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2402,36 +2402,23 @@ foreach size = ["i16", "i32", "i64"] in { } } -multiclass INT_PTX_ATOM_CAS_AS { +multiclass INT_PTX_ATOM_CAS_AS preds> { // eg. 
with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes // F_ATOMIC_3 // For SM70, PTX63 +, memory orders are supported defm _GEN: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), preds>; defm _GEN_USE_G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), preds>; defm _G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; + !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm"), preds>; defm _S: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>; - - // For older archs, we fall back to lowering to relaxed atom.cas - defm _GEN_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", "", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm")>; - - defm _GEN_USE_G_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm")>; - - defm _G_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm")>; - - defm _S_OLD: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm")>; + !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm"), preds>; } // Define atom.cas for all combinations of size x memory order 
supported in PTX *and* on the hardware. @@ -2442,9 +2429,11 @@ foreach size = ["32", "64"] in { // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. foreach order = ["acquire", "release", "acq_rel", ""] in { if !eq(order, "") then { - defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">; + defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed", [hasSM<70>, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#size#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, "", []>; } else { - defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>; + defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order, [hasSM<70>, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#size#_#order#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "", []>; } } } diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index b7b459c44bef5..2f58d279f82c3 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -1,47 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %} ; CHECK-LABEL: atom0 define i32 @atom0(ptr %addr, i32 %val) { -; CHECK: atom.add.u32 +; CHECK-LABEL: atom0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom0_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom0_param_1]; +; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw add ptr %addr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom1 define i64 @atom1(ptr %addr, 
i64 %val) { -; CHECK: atom.add.u64 +; CHECK-LABEL: atom1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom1_param_1]; +; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw add ptr %addr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom2 define i32 @atom2(ptr %subr, i32 %val) { -; CHECK: neg.s32 -; CHECK: atom.add.u32 +; CHECK-LABEL: atom2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .s32 temp; +; CHECK-NEXT: neg.s32 temp, %r1; +; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom3 define i64 @atom3(ptr %subr, i64 %val) { -; CHECK: neg.s64 -; CHECK: atom.add.u64 +; CHECK-LABEL: atom3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .s64 temp; +; CHECK-NEXT: neg.s64 temp, %rd2; +; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom4 define i32 @atom4(ptr %subr, i32 %val) { -; CHECK: atom.and.b32 +; CHECK-LABEL: atom4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom4_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom4_param_1]; +; CHECK-NEXT: 
atom.and.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw and ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom5 define i64 @atom5(ptr %subr, i64 %val) { -; CHECK: atom.and.b64 +; CHECK-LABEL: atom5( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom5_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom5_param_1]; +; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw and ptr %subr, i64 %val seq_cst ret i64 %ret } @@ -59,84 +123,198 @@ define i64 @atom5(ptr %subr, i64 %val) { ; CHECK-LABEL: atom8 define i32 @atom8(ptr %subr, i32 %val) { -; CHECK: atom.or.b32 +; CHECK-LABEL: atom8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom8_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom8_param_1]; +; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw or ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom9 define i64 @atom9(ptr %subr, i64 %val) { -; CHECK: atom.or.b64 +; CHECK-LABEL: atom9( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom9_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom9_param_1]; +; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw or ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom10 define i32 @atom10(ptr %subr, i32 %val) { -; CHECK: atom.xor.b32 +; CHECK-LABEL: atom10( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom10_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, 
[atom10_param_1]; +; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom11 define i64 @atom11(ptr %subr, i64 %val) { -; CHECK: atom.xor.b64 +; CHECK-LABEL: atom11( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom11_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom11_param_1]; +; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom12 define i32 @atom12(ptr %subr, i32 %val) { -; CHECK: atom.max.s32 +; CHECK-LABEL: atom12( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom12_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom12_param_1]; +; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw max ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom13 define i64 @atom13(ptr %subr, i64 %val) { -; CHECK: atom.max.s64 +; CHECK-LABEL: atom13( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom13_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom13_param_1]; +; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw max ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom14 define i32 @atom14(ptr %subr, i32 %val) { -; CHECK: atom.min.s32 +; CHECK-LABEL: atom14( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom14_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, 
[atom14_param_1]; +; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw min ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom15 define i64 @atom15(ptr %subr, i64 %val) { -; CHECK: atom.min.s64 +; CHECK-LABEL: atom15( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom15_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom15_param_1]; +; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw min ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom16 define i32 @atom16(ptr %subr, i32 %val) { -; CHECK: atom.max.u32 +; CHECK-LABEL: atom16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom16_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom16_param_1]; +; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom17 define i64 @atom17(ptr %subr, i64 %val) { -; CHECK: atom.max.u64 +; CHECK-LABEL: atom17( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom17_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom17_param_1]; +; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom18 define i32 @atom18(ptr %subr, i32 %val) { -; CHECK: atom.min.u32 +; CHECK-LABEL: atom18( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom18_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, 
[atom18_param_1]; +; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom19 define i64 @atom19(ptr %subr, i64 %val) { -; CHECK: atom.min.u64 +; CHECK-LABEL: atom19( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom19_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom19_param_1]; +; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %subr, i64 %val seq_cst ret i64 %ret } @@ -145,7 +323,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ; CHECK-LABEL: atomic_add_f32_generic define float @atomic_add_f32_generic(ptr %addr, float %val) { -; CHECK: atom.add.f32 +; CHECK-LABEL: atomic_add_f32_generic( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_generic_param_1]; +; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ret float %ret } @@ -154,7 +342,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %v ; CHECK-LABEL: atomic_add_f32_addrspace1 define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { -; CHECK: atom.global.add.f32 +; CHECK-LABEL: atomic_add_f32_addrspace1( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1]; +; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], 
%f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val) ret float %ret } @@ -163,52 +361,152 @@ declare float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %v ; CHECK-LABEL: atomic_add_f32_addrspace3 define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { -; CHECK: atom.shared.add.f32 +; CHECK-LABEL: atomic_add_f32_addrspace3( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace3_param_1]; +; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val) ret float %ret } ; CHECK-LABEL: atomicrmw_add_f32_generic define float @atomicrmw_add_f32_generic(ptr %addr, float %val) { -; CHECK: atom.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_generic( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1]; +; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, float %val seq_cst ret float %ret } ; CHECK-LABEL: atomicrmw_add_f16_generic define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { -; CHECK: membar.sys -; CHECK: atom.cas +; CHECK-LABEL: atomicrmw_add_f16_generic( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .f32 %f<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1]; +; CHECK-NEXT: ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0]; +; CHECK-NEXT: and.b64 %rd1, %rd2, -4; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd2; +; CHECK-NEXT: and.b32 %r7, %r6, 3; +; CHECK-NEXT: shl.b32 %r1, %r7, 3; +; CHECK-NEXT: mov.b32 %r8, 65535; +; CHECK-NEXT: shl.b32 %r9, %r8, %r1; +; CHECK-NEXT: not.b32 %r2, %r9; +; CHECK-NEXT: ld.u32 %r16, [%rd1]; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; +; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: shr.u32 %r10, %r16, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r10; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f3; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; +; CHECK-NEXT: shl.b32 %r12, %r11, %r1; +; CHECK-NEXT: and.b32 %r13, %r16, %r2; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14; +; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16; +; CHECK-NEXT: mov.u32 %r16, %r5; +; CHECK-NEXT: @%p1 bra $L__BB22_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: shr.u32 %r15, %r5, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r15; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret } ; CHECK-LABEL: atomicrmw_add_f32_addrspace1 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { -; CHECK: atom.global.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_addrspace1( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1]; +; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(1) %addr, 
float %val seq_cst ret float %ret } ; CHECK-LABEL: atomicrmw_add_f32_addrspace3 define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { -; CHECK: atom.shared.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_addrspace3( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1]; +; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst ret float %ret } ; CHECK-LABEL: atomic_cmpxchg_i32 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { -; CHECK: membar.sys -; CHECK: atom.cas.b32 +; CHECK-LABEL: atomic_cmpxchg_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0]; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2]; +; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new } ; CHECK-LABEL: atomic_cmpxchg_i64 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { -; CHECK: membar.sys -; CHECK: atom.cas.b64 +; CHECK-LABEL: atomic_cmpxchg_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0]; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1]; +; CHECK-NEXT: ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2]; +; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], 
%rd3; +; CHECK-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } From 88782d785d428b94d09ddb48c8bd82043b7132de Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Sat, 8 Feb 2025 01:03:54 +0000 Subject: [PATCH 06/10] explain duplicate rules --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 1e88753e356e5..58bbb0c00cbed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2405,9 +2405,6 @@ foreach size = ["i16", "i32", "i64"] in { multiclass INT_PTX_ATOM_CAS_AS preds> { // eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes // F_ATOMIC_3 - - - // For SM70, PTX63 +, memory orders are supported defm _GEN: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), preds>; @@ -2427,6 +2424,8 @@ foreach size = ["32", "64"] in { // *before* the instructions with the unspecified memory order [""]. // This ensures that `cmpxchg ... acquire` is lowered to "atom.acquire.cas" instead of "atom.cas" // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- + // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. 
foreach order = ["acquire", "release", "acq_rel", ""] in { if !eq(order, "") then { defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed", [hasSM<70>, hasPTX<63>]>; From bb3f04417f6fa5901d4da33f67bbbe39a72372c1 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 11 Feb 2025 22:09:27 +0000 Subject: [PATCH 07/10] cleanup tablegen rules --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 76 ++++++++++++++---------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 58bbb0c00cbed..680d0ebf219f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2380,15 +2380,15 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2(NAME) node:$ptr, node:$cmp, node:$val), AS_match.generic>; - def NAME#_g: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val), (!cast(NAME) node:$ptr, node:$cmp, node:$val), AS_match.global>; - def NAME#_s: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val), (!cast(NAME) node:$ptr, node:$cmp, node:$val), AS_match.shared>; } @@ -2405,47 +2405,57 @@ foreach size = ["i16", "i32", "i64"] in { multiclass INT_PTX_ATOM_CAS_AS preds> { // eg. 
with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes // F_ATOMIC_3 - defm _GEN: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), preds>; - - defm _GEN_USE_G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_gen), !cast("i"#type#"imm"), preds>; - - defm _G: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_g), !cast("i"#type#"imm"), preds>; - - defm _S: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_s), !cast("i"#type#"imm"), preds>; -} + defm _generic: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_generic), !cast("i"#type#"imm"), preds>; + + defm _global: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_global), !cast("i"#type#"imm"), preds>; + + defm _shared: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas", + !cast(atomic_cmp_swap_without_as#_shared), !cast("i"#type#"imm"), preds>; +} + +// eg. with type = 32, order = .acquire, atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire, and _GEN becomes +// F_ATOMIC_3 +multiclass INT_PTX_ATOM_CAS preds> + : F_ATOMIC_3("i"#type), + !cast("Int"#type#"Regs"), + order, + as, + ".b"#type, + ".cas", + !cast(atomic_cmp_swap_pat), + !cast("i"#type#"imm"), + preds>; // Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware. foreach size = ["32", "64"] in { - // We enumerate the instructions with specific memory orders ["acquire", "release", "acq_rel"] - // *before* the instructions with the unspecified memory order [""]. - // This ensures that `cmpxchg ... 
acquire` is lowered to "atom.acquire.cas" instead of "atom.cas" - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - foreach order = ["acquire", "release", "acq_rel", ""] in { - if !eq(order, "") then { - defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed", [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, "", []>; - } else { - defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order, [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_#order#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "", []>; + foreach as = ["generic", "global", "shared"] in { + defvar cas_as_string = !if(!eq(as, "generic"), "", "."#as); + foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { + defvar order_as_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- + // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. + defm INT_PTX_ATOM_CAS_#size#_#order#as + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#as, size, + order_as_string, cas_as_string, + [hasSM<70>, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#size#_#order#_old#as + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#as, size, + "", cas_as_string, []>; } } } // Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?) 
defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; + atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; + atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3, hasPTX<63>]>; + atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} From a37ba61a48af8259020bc3f9cdfb59e4fc343516 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 11 Feb 2025 22:25:56 +0000 Subject: [PATCH 08/10] renaming and cleanup --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 28 +++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 680d0ebf219f9..c9a73fae20caf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2415,14 +2415,16 @@ multiclass INT_PTX_ATOM_CAS_AS(atomic_cmp_swap_without_as#_shared), !cast("i"#type#"imm"), preds>; } -// eg. with type = 32, order = .acquire, atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire, and _GEN becomes -// F_ATOMIC_3 +// eg. with type = 32, order = .acquire, addrspace = ".global", +// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. 
+// F_ATOMIC_3 multiclass INT_PTX_ATOM_CAS preds> + string order, string addrspace, list preds> : F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, - as, + addrspace, ".b"#type, ".cas", !cast(atomic_cmp_swap_pat), @@ -2431,20 +2433,20 @@ multiclass INT_PTX_ATOM_CAS, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_#order#_old#as - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#as, size, - "", cas_as_string, []>; + defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, + "", cas_addrspace_string, []>; } } } From 645d2eabca7087c6c620ca9d53b831a68a6f647f Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 11 Feb 2025 22:29:53 +0000 Subject: [PATCH 09/10] nits --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c9a73fae20caf..10d4da2a99c35 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2415,10 +2415,12 @@ multiclass INT_PTX_ATOM_CAS_AS(atomic_cmp_swap_without_as#_shared), !cast("i"#type#"imm"), preds>; } -// eg. with type = 32, order = .acquire, addrspace = ".global", +// eg. with type = 32, order = ".acquire", addrspace = ".global", // atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. +// preds = [hasSM<70>, hasPTX<63>] // F_ATOMIC_3 +// ".cas", atomic_cmp_swap_i32_acquire_global, i32imm, +// [hasSM<70>, hasPTX<63>]> multiclass INT_PTX_ATOM_CAS preds> : F_ATOMIC_3("i"#type), @@ -2431,7 +2433,8 @@ multiclass INT_PTX_ATOM_CAS("i"#type#"imm"), preds>; -// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware. +// Define atom.cas for all combinations of size x addrspace x memory order +// supported in PTX *and* on the hardware. 
foreach size = ["32", "64"] in { foreach addrspace = ["generic", "global", "shared"] in { defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); @@ -2451,7 +2454,7 @@ foreach size = ["32", "64"] in { } } -// Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?) +// Note that 16-bit CAS support in PTX is emulated. defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3 Date: Tue, 11 Feb 2025 22:34:16 +0000 Subject: [PATCH 10/10] remove dead code --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 10d4da2a99c35..ef4c942a49a92 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2402,19 +2402,6 @@ foreach size = ["i16", "i32", "i64"] in { } } -multiclass INT_PTX_ATOM_CAS_AS preds> { - // eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes - // F_ATOMIC_3 - defm _generic: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, "", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_generic), !cast("i"#type#"imm"), preds>; - - defm _global: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_global), !cast("i"#type#"imm"), preds>; - - defm _shared: F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas", - !cast(atomic_cmp_swap_without_as#_shared), !cast("i"#type#"imm"), preds>; -} - // eg. with type = 32, order = ".acquire", addrspace = ".global", // atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. // preds = [hasSM<70>, hasPTX<63>]