diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 443db4391a523..edc8e33559d97 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -268,8 +268,8 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, llvm_unreachable("Empty Modifier"); } -void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, StringRef Modifier) { +void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, + raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); if (Modifier == "sem") { @@ -286,6 +286,12 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::Ordering::Release: O << ".release"; return; + case NVPTX::Ordering::AcquireRelease: + O << ".acq_rel"; + return; + case NVPTX::Ordering::SequentiallyConsistent: + O << ".seq_cst"; + return; case NVPTX::Ordering::Volatile: O << ".volatile"; return; @@ -294,14 +300,14 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; default: report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" sem modifier. " - "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.", + "NVPTX AtomicCode Printer does not support \"{}\" sem modifier. 
", OrderingToString(Ordering))); } } else if (Modifier == "scope") { auto S = NVPTX::Scope(Imm); switch (S) { case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: return; case NVPTX::Scope::System: O << ".sys"; @@ -316,9 +322,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, O << ".gpu"; return; } - report_fatal_error( - formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.", - ScopeToString(S))); + report_fatal_error(formatv( + "NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", + ScopeToString(S))); } else if (Modifier == "addsp") { auto A = NVPTX::AddressSpace(Imm); switch (A) { @@ -334,7 +340,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; } report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.", + "NVPTX AtomicCode Printer does not support \"{}\" addsp modifier.", AddressSpaceToString(A))); } else if (Modifier == "sign") { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 193c436939f66..c3ff3469150e4 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -40,8 +40,8 @@ class NVPTXInstPrinter : public MCInstPrinter { StringRef Modifier = {}); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); - void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O, - StringRef Modifier = {}); + void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 15997bc3878d8..180ce4ab02a27 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -174,7 
+174,8 @@ enum Scope : ScopeUnderlyingType { Cluster = 2, Device = 3, System = 4, - LASTSCOPE = System + DefaultDevice = 5, // For SM < 70: denotes PTX op implicit/default .gpu scope + LASTSCOPE = DefaultDevice }; using AddressSpaceUnderlyingType = unsigned int; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ae73d8da79f8e..65e7c56774547 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -494,7 +494,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { return true; } -static std::optional convertAS(unsigned AS) { +static std::optional convertAS(unsigned AS) { switch (AS) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::AddressSpace::Local; @@ -515,11 +515,42 @@ static std::optional convertAS(unsigned AS) { } } -static unsigned int getCodeAddrSpace(const MemSDNode *N) { +NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) { return convertAS(N->getMemOperand()->getAddrSpace()) .value_or(NVPTX::AddressSpace::Generic); } +NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { + // No "sem" orderings for SM/PTX versions which do not support memory ordering + if (!Subtarget->hasMemoryOrdering()) + return NVPTX::Ordering::NotAtomic; + auto Ordering = N->getMergedOrdering(); + switch (Ordering) { + case AtomicOrdering::NotAtomic: + return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + return NVPTX::Ordering::Relaxed; + case AtomicOrdering::Acquire: + return NVPTX::Ordering::Acquire; + case AtomicOrdering::Release: + return NVPTX::Ordering::Release; + case AtomicOrdering::AcquireRelease: + return NVPTX::Ordering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return NVPTX::Ordering::SequentiallyConsistent; + } + llvm_unreachable("Invalid atomic ordering"); +} + +NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { 
+ // No "scope" modifier for SM/PTX versions which do not support scoped atomics + // Functionally, these atomics are at device scope + if (!Subtarget->hasAtomScope()) + return NVPTX::Scope::DefaultDevice; + return Scopes[N->getSyncScopeID()]; +} + namespace { struct OperationOrderings { @@ -532,7 +563,7 @@ struct OperationOrderings { static OperationOrderings getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); - auto CodeAddrSpace = getCodeAddrSpace(N); + auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); @@ -756,7 +787,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, } static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, - unsigned CodeAddrSpace) { + NVPTX::AddressSpace CodeAddrSpace) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address // space. return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global && @@ -788,6 +819,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -807,6 +839,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -826,6 +859,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_acq_rel_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -846,6 +880,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", ScopeToString(S))); } @@ -1025,7 +1060,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1097,7 +1132,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1313,7 +1348,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -1363,7 +1398,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 88e5328ff69c5..b99b4ef2d3076 100644 --- 
a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -100,6 +100,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + NVPTX::Ordering getMemOrder(const MemSDNode *N) const; + NVPTX::Scope getAtomicScope(const MemSDNode *N) const; bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset); SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); @@ -114,6 +116,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { std::pair insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N); NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const; + +public: + static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 14f05250ad6b8..d017c658c53a3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6315,10 +6315,12 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Specialize for cmpxchg // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent + ? 
Ord + : AtomicOrdering::Release, + SSID); return nullptr; } @@ -6330,15 +6332,15 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + auto *CI = cast(Inst); auto CASWidth = - cast( - dyn_cast(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); + cast(CI->getCompareOperand()->getType())->getBitWidth(); + SyncScope::ID SSID = CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index ecae03e77aa83..4eef6c939720c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1608,8 +1608,8 @@ def ADDR : Operand { let MIOperandInfo = (ops ADDR_base, i32imm); } -def LdStCode : Operand { - let PrintMethod = "printLdStCode"; +def AtomicCode : Operand { + let PrintMethod = "printAtomicCode"; } def MmaCode : Operand { @@ -1962,7 +1962,7 @@ defm ProxyRegB64 : ProxyRegInst<"b64", B64>; class LD : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; @@ -1978,7 +1978,7 @@ class ST : NVPTXInst< (outs), (ins O:$src, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" " \t[$addr], $src;", []>; @@ -1996,21 +1996,21 @@ let mayStore=1, 
hasSideEffects=0 in { multiclass LD_VEC { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " @@ -2027,14 +2027,14 @@ multiclass ST_VEC { def _v2 : NVPTXInst< (outs), (ins O:$src1, O:$src2, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " 
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; @@ -2043,7 +2043,7 @@ multiclass ST_VEC { (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, O:$src5, O:$src6, O:$src7, O:$src8, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 93827be5c2811..bad4c3c4c5f3a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,46 @@ def AS_match { }]; } + +//===----------------------------------------------------------------------===// +// NVPTX Scope Constants +// These map to the Scope enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Scope_thread : PatLeaf<(i32 0)>; // Thread = 0 +def Scope_cta : PatLeaf<(i32 1)>; // Block = 1 +def Scope_cluster : PatLeaf<(i32 2)>; // Cluster = 2 +def Scope_device : PatLeaf<(i32 3)>; // Device = 3 +def Scope_sys : PatLeaf<(i32 4)>; // System = 4 + +//===----------------------------------------------------------------------===// +// NVPTX Address Space Constants +// These map to the AddressSpace enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def AddrSpace_gen : PatLeaf<(i32 0)>; // Generic = 0 +def AddrSpace_global : PatLeaf<(i32 1)>; // Global = 1 +def AddrSpace_shared : PatLeaf<(i32 3)>; // Shared = 3 +def AddrSpace_const : PatLeaf<(i32 4)>; // Const = 4 +def AddrSpace_local : PatLeaf<(i32 5)>; // Local = 5 +def AddrSpace_shared_cluster : PatLeaf<(i32 7)>; // SharedCluster = 7 +def AddrSpace_param : PatLeaf<(i32 101)>; // Param = 101 + +//===----------------------------------------------------------------------===// +// NVPTX Ordering Constants +// These map to the Ordering enum in NVPTX.h 
+//===----------------------------------------------------------------------===// + +def Ordering_not_atomic : PatLeaf<(i32 0)>; // NotAtomic = 0 +def Ordering_relaxed : PatLeaf<(i32 2)>; // Relaxed = 2 +def Ordering_acquire : PatLeaf<(i32 4)>; // Acquire = 4 +def Ordering_release : PatLeaf<(i32 5)>; // Release = 5 +def Ordering_acquire_release : PatLeaf<(i32 6)>; // AcquireRelease = 6 +def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsistent = 7 +def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8 +def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9 + + // A node that will be replaced with the current PTX version. class PTX { SDNodeXForm PTXVerXform = SDNodeXForm preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str; +multiclass F_ATOMIC_3 { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str; + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, - Requires; + def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ir : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, - Requires; + def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ri : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, - Requires; + def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ii : BasicNVPTXInst<(outs t.RC:$dst), - (ins 
ADDR:$addr, t.Imm:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, - Requires; + def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; } + + defvar GetSem = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetScope = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetAddSp = SDNodeXForm(N)), SDLoc(N)); + }]>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, t.Ty:$c), + (!cast(NAME # _rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c), + (!cast(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), + (!cast(NAME # _ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), + (!cast(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; } multiclass F_ATOMIC_2_AS preds = []> { @@ -1899,14 +1954,6 @@ multiclass F_ATOMIC_2_AS, preds>; } -multiclass F_ATOMIC_3_AS preds = []> { - defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3, preds>; - defm _S : F_ATOMIC_3, preds>; - defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3, preds>; -} - // atom_add defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS; defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; @@ -1951,23 +1998,12 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS("atomic_cmp_swap_i"#t.Size#_#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. 
- // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS; - } +foreach t = [I16RT, I32RT, I64RT] in { + defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size); + defm INT_PTX_ATOM_CAS_#t.Size + : F_ATOMIC_3; } -// Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; - // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} // and converts it into the appropriate instruction. @@ -1991,19 +2027,6 @@ multiclass ATOM2N_impl; } -multiclass ATOM3N_impl Preds> { - defm "" : F_ATOMIC_3( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - preds = Preds>; -} // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl Preds> { - // No need to define ".gpu"-scoped atomics. They do the same thing - // as the regular, non-scoped atomics defined elsewhere. + +multiclass F_ATOMIC_3_INTRINSIC_PATTERN { foreach scope = ["cta", "sys"] in { - // For now we only need variants for generic space pointers. 
foreach space = ["gen"] in { - defm _#scope#space : ATOM3N_impl; + defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_i_" # scope); + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), + (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)), + (!cast(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; } } } @@ -2069,9 +2099,9 @@ multiclass ATOM2_incdec_impl { // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; + defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; @@ -2137,7 +2167,7 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>; // during the lifetime of the kernel. 
class LDG_G - : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; def LD_GLOBAL_NC_i8 : LDG_G; @@ -2150,19 +2180,19 @@ def LD_GLOBAL_NC_i64 : LDG_G; // Elementized vector ldg class VLDG_G_ELE_V2 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; class VLDG_G_ELE_V4 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; class VLDG_G_ELE_V8 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 88d3eefcc521e..4eb452f398220 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -165,6 +165,8 @@ inline std::string ScopeToString(Scope S) { return "Cluster"; case Scope::Device: return "Device"; + case Scope::DefaultDevice: + return "DefaultDevice"; } report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".", static_cast(S))); diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll index c7a0c60ae1f4d..94b3f0a2e1c3e 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll @@ -93,7 +93,8 @@ entry: %3 = atomicrmw or ptr %0, i8 %1 monotonic, align 1 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i8 %1 monotonic, align 1 ret void } @@ -101,13 +102,17 @@ entry: ; CHECK-LABEL: minmax_i8 define void @minmax_i8(ptr %0, i8 %1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i8 %1 monotonic, align 1 ret void } @@ -121,7 +126,8 @@ entry: %3 = atomicrmw or ptr %0, i16 %1 monotonic, align 2 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i16 %1 monotonic, align 2 ret void } @@ -129,13 +135,17 @@ entry: ; CHECK-LABEL: minmax_i16 define void @minmax_i16(ptr %0, i16 
%1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i16 %1 monotonic, align 2 ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index 94f49b01e6ea6..f710d7f883a1b 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -70,7 +70,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX62-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX62-NEXT: mov.b32 %r54, %r6; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; @@ -86,7 +86,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX62-NEXT: mov.b32 %r55, %r9; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; @@ -107,7 +107,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.global.cas.b32 %r15, 
[%r10], %r56, %r45; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX62-NEXT: mov.b32 %r56, %r15; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; @@ -128,7 +128,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX62-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX62-NEXT: mov.b32 %r57, %r21; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index b21bd16d55c2c..f96fd30019025 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.b32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.b32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr 
addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.b32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.b32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 9f900c961d2ed..63c389c36e87e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ 
define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; 
SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, 
i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -212,7 +215,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +225,15 @@ 
define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -299,13 
+303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; 
; SM60-NEXT: .reg .b16 %rs<2>; @@ -425,10 +428,11 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -439,13 +443,13 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 
+637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1429 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; 
SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 
%r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: 
.reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: 
@%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: 
ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i16 
@release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: 
membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; 
+; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; 
SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 
%rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, 
%r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; 
SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } 
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; 
SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB30_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB30_1; -; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 
%rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB31_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB31_1; -; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( ; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB32_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB32_1; -; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 
%rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB33_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB33_1; -; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB34_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB34_1; -; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB35_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB35_1; -; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, 
%r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB36_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB36_1; -; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 
%r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB37_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB37_1; -; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB38_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB38_1; -; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 
bra $L__BB39_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB39_1; -; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB40_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, 
%r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB40_1; -; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB41_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB41_1; -; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB42_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB42_1; -; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB43_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB43_1; -; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 
%r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB44_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB44_1; -; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB45_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB45_1; -; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; 
-; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB46_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB46_1; -; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB47_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB47_1; -; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, 
%r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB48_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB48_1; -; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB49_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB49_1; -; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB50_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB50_1; -; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB51_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB51_1; -; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB52_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB52_1; -; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( -; SM60: 
{ -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB53_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB53_1; -; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 
%rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB54_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB54_1; -; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB55_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB55_1; -; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, 
%r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB56_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB56_1; -; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB57_1: // 
%partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB57_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB57_1; -; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, 
%r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB58_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB58_1; -; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB59_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, 
%r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB59_1; -; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB60_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB60_1; -; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; 
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB61_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB61_1; -; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
acquire_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB62_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB62_1; -; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB63_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB63_1; -; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[release_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB64_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB64_1; -; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 
%r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB65_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB65_1; -; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, 
[%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB66_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB66_1; -; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, 
%r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB67_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB67_1; -; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB68_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB68_1; -; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB69_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB69_1; -; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB70_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB70_1; -; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB71_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB71_1; -; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( -; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB72_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB72_1; -; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB73_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB73_1; -; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[acq_rel_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB74_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB74_1; -; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: 
mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB75_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB75_1; -; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: 
ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB76_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB76_1; -; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, 
%r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB77_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB77_1; -; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB78_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB78_1; -; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB79_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB79_1; -; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB80_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB80_1; -; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB81_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB81_1; -; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( -; SM60: { -; SM60-NEXT: 
.reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB82_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB82_1; -; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 
%rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB83_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB83_1; -; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[seq_cst_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB84_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB84_1; -; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; 
-; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB85_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB85_1; -; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB86_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB86_1; -; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, 
%r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB87_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB87_1; -; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB88_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB88_1; -; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB89_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; 
SM60-NEXT: @%p2 bra $L__BB89_1; -; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: 
ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; 
SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: 
acquire_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { 
-; SM60-LABEL: acquire_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 
@acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; 
-; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, 
%r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 
%r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: 
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg 
.b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; 
SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; 
SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; 
SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[acquire_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; 
SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, 
[%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg 
ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
%addr, i64 %cmp, i64 %new release acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: 
ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
acquire_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: 
ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_cta( ; SM60: { ; 
SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) 
%addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 
%rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: 
// %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr 
addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB60_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB60_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB60_1; +; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) 
{ +; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB64_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB64_1; +; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; 
SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB65_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB65_1; +; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 
28b258dc2a868..5cb344d5ded84 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; 
SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -212,7 +215,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +225,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } 
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -299,13 +303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra 
$L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -425,10 +428,11 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; 
@@ -439,13 +443,13 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 
@seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 +637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; 
SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1429 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: 
not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: 
-; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { 
-; SM70-LABEL: release_monotonic_i8_global( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; 
SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 
255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 
[func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, 
%r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, 
[release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
release_seq_cst_i8_generic( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra 
$L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; 
SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; 
+; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB30_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB30_1; -; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB31_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; 
SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB31_1; -; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB32_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB32_1; -; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; 
+; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB33_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB33_1; -; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB34_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB34_1; -; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 
@acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB35_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB35_1; -; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; 
SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB36_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB36_1; -; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB37_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB37_1; -; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; 
SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB38_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB38_1; -; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: 
and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB39_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB39_1; -; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB40_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB40_1; -; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB41_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB41_1; -; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
-; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB42_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB42_1; -; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, 
%r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB43_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB43_1; -; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB44_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1 -; 
SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB44_1; -; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB45_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB45_1; -; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB46_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB46_1; -; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB47_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB47_1; -; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB48_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB48_1; -; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: 
ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB49_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB49_1; -; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 
%r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB50_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB50_1; -; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB51_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB51_1; -; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB52_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB52_1; -; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB53_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB53_1; -; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB54_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; 
SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB54_1; -; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB55_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB55_1; -; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB56_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB56_1; -; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 
@acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB57_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB57_1; -; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg 
.b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB58_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB58_1; -; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB59_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB59_1; -; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: 
shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB60_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB60_1; -; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; 
-; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB61_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB61_1; -; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB62_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB62_1; -; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, 
%r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB63_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB63_1; -; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB64_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM70-NEXT: and.b32 
%r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB64_1; -; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB65_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB65_1; -; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB66_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB66_1; -; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global(ptr 
addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB67_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB67_1; -; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: 
.reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB68_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB68_1; -; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB69_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB69_1; -; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB70_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB70_1; -; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, 
%r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB71_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB71_1; -; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 
%r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB72_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB72_1; -; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, 
%r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB73_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB73_1; -; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB74_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB74_1; -; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB75_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB75_1; -; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB76_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB76_1; -; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB77_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB77_1; -; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define 
i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB78_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB78_1; -; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg 
.b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB79_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB79_1; -; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB80_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB80_1; -; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; -; SM70-NEXT: and.b64 
%rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB81_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB81_1; -; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, 
%r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB82_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB82_1; -; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB83_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB83_1; -; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB84_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB84_1; -; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: 
@%p1 bra $L__BB85_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB85_1; -; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB86_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB86_1; -; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB87_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB87_1; -; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB88_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB88_1; -; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 
@seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB89_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB89_1; -; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 
%rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
monotonic_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 
@monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 
[func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM70-NEXT: 
atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acquire_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM70-NEXT: 
fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: 
-; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; 
SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
release_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 
@acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel 
acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[seq_cst_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; 
SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; 
-; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_cta( ; SM70: { ; SM70-NEXT: .reg .b32 %r<4>; ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic 
- ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, 
i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], 
%rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 
%new) { -; SM70-LABEL: acquire_monotonic_i64_global( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; 
+; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
acquire_acquire_i64_global( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global( 
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
release_monotonic_i64_global( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], 
%rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { 
-; SM70-LABEL: release_acquire_i64_global( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: 
atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; 
SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 
@acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: 
ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; 
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 
@acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: 
ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold 
= cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB60_3; +; SM70-NEXT: 
// %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB60_1; +; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
seq_cst_acquire_i64_generic( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_sys( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB64_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB64_1; +; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: 
fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB65_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB65_1; +; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; 
SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 368fe3f036c9e..7cb259023d6dd 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 
%r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg 
.b16 %rs<2>; @@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -212,7 +215,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +225,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 
%new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -299,13 +303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; 
SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, 
%r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -425,10 +428,11 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, 
[acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -439,13 +443,13 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg 
.b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 
@acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 +637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: 
@%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 
%r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1446 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, 
[monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, 
i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM90-NEXT: 
and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
%addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: 
and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( ; SM90: { ; 
SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, 
%r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, 
%r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, 
[release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, 
%r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; 
; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
acq_rel_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; 
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM90-NEXT: 
fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB30_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB30_1; -; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB31_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB31_1; -; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB32_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB32_1; -; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: 
not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB33_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB33_1; -; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB34_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB34_1; -; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB35_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB35_1; -; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, 
%r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB36_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB36_1; -; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB37_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in 
Loop: Header=BB37_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB37_1; -; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB38_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; 
SM90-NEXT: @%p2 bra $L__BB38_1; -; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB39_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB39_1; -; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB40_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB40_1; -; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB41_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB41_1; -; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg 
.b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB42_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB42_1; -; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB43_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB43_1; -; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB44_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB44_1; -; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 
%r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB45_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB45_1; -; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; 
SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB46_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB46_1; -; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; 
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB47_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB47_1; -; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB48_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 -; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB48_1; -; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB49_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB49_1; -; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB50_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB50_1; -; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - 
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB51_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB51_1; -; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 
%rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB52_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB52_1; -; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB53_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB53_1; -; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB54_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB54_1; -; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB55_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB55_1; -; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB56_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB56_1; -; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB57_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB57_1; -; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB58_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB58_1; -; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB59_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 
bra $L__BB59_1; -; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB60_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB60_1; -; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, 
i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB61_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB61_1; -; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
acquire_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB62_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB62_1; -; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB63_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB63_1; -; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: 
ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB64_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB64_1; -; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 
%r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB65_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB65_1; -; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB66_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB66_1; -; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB67_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB67_1; -; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB68_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB68_1; -; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB69_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; 
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB69_1; -; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB70_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB70_1; -; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB71_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB71_1; -; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 
%new -} - -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB72_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB72_1; -; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; 
SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB73_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB73_1; -; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 
%rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB74_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB74_1; -; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 
%r9, [acq_rel_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB75_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB75_1; -; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB76_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB76_1; -; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB77_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB77_1; -; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB78_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB78_1; -; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB79_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB79_1; -; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB80_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM90-NEXT: 
and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB80_1; -; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB81_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB81_1; -; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB82_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB82_1; -; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic 
- ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB83_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB83_1; -; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg 
.pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB84_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB84_1; -; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB85_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB85_1; -; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_acquire_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB86_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB86_1; -; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; 
-; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB87_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB87_1; -; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB88_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB88_1; -; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB89_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB89_1; -; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 
[func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM90-NEXT: 
atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, 
[monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; 
SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global( -; SM90: { -; 
SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new 
-} - -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
%addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, 
%r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; 
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: 
ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( -; 
SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define 
i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: 
ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: 
ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret 
i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; 
SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire 
monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg 
ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 
%new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, 
i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; 
-; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
release_acquire_i64_shared( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; 
SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 
%rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 
%cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; 
SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; 
SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB60_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB60_1; +; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: 
ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB65_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB65_1; +; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( 
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB66_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra 
$L__BB66_1; +; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 
%r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 25b4c74086dc1..237e42394ba2f 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -99,8 +99,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, 
%r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; ; 
SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: 
ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i32( @@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i32( @@ -1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; -; SM90-NEXT: 
ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i32( @@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i32( @@ -1451,9 +1451,9 
@@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; ; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i32( @@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; @@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: 
atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i64( @@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i64( @@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1607,7 +1607,7 @@ define i64 
@acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i64( @@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i64( @@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; ; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i64( @@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index ae7450015ecd2..75623a59ad481 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -5,6 +5,14 @@ from itertools import product cmpxchg_func = Template( + """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure + ret i$size %new +} +""" +) + +cmpxchg_func_no_scope = Template( """define i$size 
@${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure ret i$size %new @@ -18,6 +26,14 @@ """ ) + +def get_addrspace_cast(addrspace): + if addrspace == 0: + return "" + else: + return " addrspace({})".format(str(addrspace)) + + TESTS = [(60, 50), (70, 63), (90, 87)] LLVM_SCOPES = ["", "block", "cluster", "device"] @@ -34,24 +50,84 @@ ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"} + if __name__ == "__main__": for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace in product( - SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES + + # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES + # This is very large, so we instead test 3 slices. + + # First slice: are all orderings correctly supported, with and without emulation loops? + # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes + addrspace, llvm_scope = 1, "block" + for size, success, failure in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS ): - if addrspace == 0: - addrspace_cast = "" - else: - addrspace_cast = " addrspace({})".format(str(addrspace)) print( cmpxchg_func.substitute( success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], - addrspace_cast=addrspace_cast, + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) + + # Second slice: Are all scopes correctly supported, with and without emulation loops?
+ # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32 + addrspace, success, failure = 1, "acq_rel", "acquire" + for size in [8, 32]: + print( + cmpxchg_func_no_scope.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + ), + file=fp, + ) + + for llvm_scope in LLVM_SCOPES: + if sm < 90 and llvm_scope == "cluster": + continue + if llvm_scope == "block": + # skip (acq_rel, acquire, global, cta) + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) + + # Third slice: Are all address spaces correctly supported? + # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32 + success, failure, llvm_scope = "acq_rel", "acquire", "block" + for size, addrspace in product([8, 32], ADDRSPACES): + if addrspace == 1: + # skip (acq_rel, acquire, global, cta) + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index a1020e68e1bae..2841e6751d029 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -171,30 +171,30 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: 
atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; +; CHECK-NEXT: 
atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; ; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; ; CHECK-NEXT: and.b32 %r34, %r33, 3; @@ -209,7 +209,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; ; CHECK-NEXT: setp.eq.b32 %p1, %r6, %r39; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 @@ -224,7 +224,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; ; CHECK-NEXT: setp.eq.b32 %p3, %r10, %r41; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // 
%partword.cmpxchg.failure22 @@ -241,7 +241,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; ; CHECK-NEXT: setp.eq.b32 %p5, %r14, %r43; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 @@ -257,7 +257,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; ; CHECK-NEXT: setp.eq.b32 %p7, %r18, %r45; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 @@ -274,7 +274,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; ; CHECK-NEXT: setp.eq.b32 %p9, %r22, %r47; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure