diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 3300ed9a5a81c..30feceba85a7e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -170,6 +170,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { } break; } + case NVPTXISD::ATOMIC_CMP_SWAP_B128: + case NVPTXISD::ATOMIC_SWAP_B128: + selectAtomicSwap128(N); + return; case ISD::FADD: case ISD::FMUL: case ISD::FSUB: @@ -2337,3 +2341,28 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { } } } + +void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) { + MemSDNode *AN = cast<MemSDNode>(N); + SDLoc dl(N); + + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); + SmallVector<SDValue> Ops{Base, Offset}; + Ops.append(N->op_begin() + 2, N->op_end()); + Ops.append({ + getI32Imm(getMemOrder(AN), dl), + getI32Imm(getAtomicScope(AN), dl), + getI32Imm(getAddrSpace(AN), dl), + }); + + assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 || + N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128); + unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128 + ? 
NVPTX::ATOM_EXCH_B128 + : NVPTX::ATOM_CAS_B128; + + auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops); + CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand()); + + ReplaceNode(N, ATOM); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index e2ad55bc1796d..8dcd5362c4512 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -90,6 +90,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); void SelectTcgen05St(SDNode *N, bool hasOffset = false); + void selectAtomicSwap128(SDNode *N); inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 997c33f1f6a76..1224e11158d47 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1036,7 +1036,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand); - // No FPOW or FREM in PTX. + + // atom.b128 is legal in PTX but since we don't represent i128 as a legal + // type, we need to custom lower it. + setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128, + Custom); // Now deduce the information based on the above mentioned // actions @@ -1044,7 +1048,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // PTX support for 16-bit CAS is emulated. Only use 32+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); - setMaxAtomicSizeInBitsSupported(64); + setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 
128 : 64); setMaxDivRemBitWidthSupported(64); // Custom lowering for tcgen05.ld vector operands @@ -1077,6 +1081,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::FIRST_NUMBER: break; + MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128) + MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128) MAKE_CASE(NVPTXISD::RET_GLUE) MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) @@ -6236,6 +6242,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, Results.push_back(Res); } +static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, + const NVPTXSubtarget &STI, + SmallVectorImpl<SDValue> &Results) { + assert(N->getValueType(0) == MVT::i128 && + "Custom lowering for atomic128 only supports i128"); + + AtomicSDNode *AN = cast<AtomicSDNode>(N); + SDLoc dl(N); + + if (!STI.hasAtomSwap128()) { + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + DAG.getMachineFunction().getFunction(), + "Support for b128 atomics introduced in PTX ISA version 8.3 and " + "requires target sm_90.", + dl.getDebugLoc())); + + Results.push_back(DAG.getUNDEF(MVT::i128)); + Results.push_back(AN->getOperand(0)); // Chain + return; + } + + SmallVector<SDValue> Ops; + Ops.push_back(AN->getOperand(0)); // Chain + Ops.push_back(AN->getOperand(1)); // Ptr + for (const auto &Op : AN->ops().drop_front(2)) { + // Low part + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op, + DAG.getIntPtrConstant(0, dl))); + // High part + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op, + DAG.getIntPtrConstant(1, dl))); + } + unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP + ? 
NVPTXISD::ATOMIC_SWAP_B128 + : NVPTXISD::ATOMIC_CMP_SWAP_B128; + SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128, + AN->getMemOperand()); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128, + {Result.getValue(0), Result.getValue(1)})); + Results.push_back(Result.getValue(2)); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6256,6 +6305,10 @@ void NVPTXTargetLowering::ReplaceNodeResults( case NVPTXISD::ProxyReg: replaceProxyReg(N, DAG, *this, Results); return; + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_SWAP: + replaceAtomicSwap128(N, DAG, STI, Results); + return; } } @@ -6280,16 +6333,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } assert(Ty->isIntegerTy() && "Ty should be integer at this point"); - auto ITy = cast<IntegerType>(Ty); + const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); switch (AI->getOperation()) { default: return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::BinOp::Xchg: + if (BitWidth == 128) + return AtomicExpansionKind::None; + LLVM_FALLTHROUGH; case AtomicRMWInst::BinOp::And: case AtomicRMWInst::BinOp::Or: case AtomicRMWInst::BinOp::Xor: - case AtomicRMWInst::BinOp::Xchg: - switch (ITy->getBitWidth()) { + switch (BitWidth) { case 8: case 16: return AtomicExpansionKind::CmpXChg; @@ -6299,6 +6355,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (STI.hasAtomBitwise64()) return AtomicExpansionKind::None; return AtomicExpansionKind::CmpXChg; + case 128: + return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); } @@ -6308,7 +6366,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::BinOp::Max: case AtomicRMWInst::BinOp::Min: case AtomicRMWInst::BinOp::UMax: case AtomicRMWInst::BinOp::UMin: - switch (ITy->getBitWidth()) { + switch 
(BitWidth) { case 8: case 16: return AtomicExpansionKind::CmpXChg; @@ -6318,17 +6376,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (STI.hasAtomMinMax64()) return AtomicExpansionKind::None; return AtomicExpansionKind::CmpXChg; + case 128: + return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); } case AtomicRMWInst::BinOp::UIncWrap: case AtomicRMWInst::BinOp::UDecWrap: - switch (ITy->getBitWidth()) { + switch (BitWidth) { case 32: return AtomicExpansionKind::None; case 8: case 16: case 64: + case 128: return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index e7f1a4b4c98c4..80d2f626767ea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -81,7 +81,17 @@ enum NodeType : unsigned { CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z, FIRST_MEMORY_OPCODE, - LoadV2 = FIRST_MEMORY_OPCODE, + + /// These nodes are used to lower atomic instructions with i128 type. They are + /// similar to the generic nodes, but the input and output values are split + /// into two 64-bit values. 
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP_B128(INCHAIN, ptr, cmpLo, cmpHi, + /// swapLo, swapHi) + /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP_B128(INCHAIN, ptr, amtLo, amtHi) + ATOMIC_CMP_SWAP_B128 = FIRST_MEMORY_OPCODE, + ATOMIC_SWAP_B128, + + LoadV2, LoadV4, LoadV8, LDUV2, // LDU.v2 diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4d6f7b3d96601..7f29c3788d810 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -104,6 +104,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; +def hasAtomSwap128 : Predicate<"Subtarget->hasAtomSwap128()">; def hasClusters : Predicate<"Subtarget->hasClusters()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index cba14066f0c0b..c544911bdf1e3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1990,19 +1990,23 @@ multiclass F_ATOMIC_3; def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, 
AtomicCode:$addsp), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; } @@ -2200,6 +2204,37 @@ defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; +// atom.*.b128 + +let mayLoad = true, mayStore = true, hasSideEffects = true, + Predicates = [hasAtomSwap128] in { + def ATOM_CAS_B128 : + NVPTXInst< + (outs B64:$dst0, B64:$dst1), + (ins ADDR:$addr, B64:$cmp0, B64:$cmp1, B64:$swap0, B64:$swap1, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + "{{\n\t" + ".reg .b128 cmp, swap, dst;\n\t" + "mov.b128 cmp, {$cmp0, $cmp1};\n\t" + "mov.b128 swap, {$swap0, $swap1};\n\t" + "atom${sem:sem}${scope:scope}${addsp:addsp}.cas.b128 dst, [$addr], cmp, swap;\n\t" + "mov.b128 {$dst0, $dst1}, dst;\n\t" + "}}">; + + def ATOM_EXCH_B128 : + NVPTXInst< + (outs B64:$dst0, B64:$dst1), + (ins ADDR:$addr, B64:$amt0, B64:$amt1, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + "{{\n\t" + ".reg .b128 amt, dst;\n\t" + "mov.b128 amt, {$amt0, $amt1};\n\t" + "atom${sem:sem}${scope:scope}${addsp:addsp}.exch.b128 dst, [$addr], amt;\n\t" + "mov.b128 {$dst0, $dst1}, dst;\n\t" + "}}">; +} + + //----------------------------------- // Support for ldu on sm_20 or later //----------------------------------- diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index acf025b70ce34..6cee4ff52ae0c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -82,6 +82,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } + bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; } bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 
78; } bool hasLDG() const { return SmVersion >= 32; } bool hasHWROT32() const { return SmVersion >= 32; } diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll index b19f6d56b9a91..392cd8b26d27e 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll @@ -4,12 +4,12 @@ ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @bitwise_i128(ptr %0, i128 %1) { +define void @bitwise_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw and ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw or ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw xor ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw xchg ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw and ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw or ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw xor ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw xchg ptr %0, i256 %1 monotonic, align 16 ret void } @@ -17,11 +17,11 @@ entry: ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @minmax_i128(ptr %0, i128 %1) { +define void @minmax_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw min ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw max ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw umin ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw umax ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw min ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw max ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw umin ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw umax ptr %0, i256 %1 monotonic, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll new file mode 100644 index 0000000000000..7cae7ebb642b3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll @@ -0,0 +1,1003 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: not llc < %s -mcpu=sm_90 -mattr=+ptx82 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: not llc < %s -mcpu=sm_80 -mattr=+ptx84 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx84 | FileCheck %s --check-prefix=CHECK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.4 %{ llc < %s -mcpu=sm_90 -mattr=+ptx84 | %ptxas-verify -arch=sm_90 %} + +;; TODO: Update cmpxchg.py so that it can automatically generate the IR for +;; these test cases. + +target triple = "nvptx64-nvidia-cuda" + +;; Check that the first couple of error messages are correct. +; ERROR: error: unsupported cmpxchg +; ERROR: error: unsupported cmpxchg + +define i128 @test_xchg_generic(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_generic_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_global(ptr addrspace(1) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_global_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.global.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: 
st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(1) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared(ptr addrspace(3) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(3) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared_cluster(ptr addrspace(7) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared::cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(7) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_block(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_block_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: 
mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cta.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("block") release + ret i128 %old +} + +define i128 @test_xchg_cluster(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("cluster") release + ret i128 %old +} + +define i128 @test_xchg_gpu(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_gpu_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.gpu.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("device") release + ret i128 %old +} + +define i128 @test_xchg_sys(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_sys( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_sys_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, 
[test_xchg_sys_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_relaxed(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_relaxed( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_relaxed_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_relaxed_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.relaxed.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt monotonic + ret i128 %old +} + +define i128 @test_xchg_acquire(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acquire_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.acquire.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acquire + ret i128 %old +} + +define i128 @test_xchg_release(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_release( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_xchg_release_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_release_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_acq_rel(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acq_rel( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acq_rel_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acq_rel_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.acq_rel.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acq_rel + ret i128 %old +} + +define i128 @test_cmpxchg_generic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_generic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_generic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic + 
ret i128 %new +} + +define i128 @test_cmpxchg_global(ptr addrspace(1) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_global_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_global_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.global.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared(ptr addrspace(3) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_block(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b64 %rd1, [test_cmpxchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_block_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_block_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cta.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("block") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_cluster(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("cluster") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_gpu(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_gpu_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_gpu_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, 
dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.gpu.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("device") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared_cluster(ptr addrspace(7) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(7) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: 
atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic
+ ret i128 %new
+}
+
+; NOTE(review): the CHECK bodies below are autogenerated (update_llc_test_checks.py
+; style); regenerate them with the script rather than hand-editing individual lines.
+; NOTE(review): every cmpxchg test returns %new (the swap operand), not the loaded
+; old value -- the stored {%rd4, %rd5} are the param_2 loads -- so these CHECKs pin
+; only that an atom.<order>.sys.cas.b128 is emitted, not the result plumbing.
+define i128 @test_cmpxchg_monotonic_acquire(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_monotonic_acquire(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_acquire_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_acquire_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_acquire_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic acquire
+ ret i128 %new
+}
+
+; NOTE(review): cases involving a seq_cst ordering expect plain atom.acquire with no
+; fence.sc visible in these CHECKs -- presumably the sc fence is emitted elsewhere or
+; the ordering is adjusted by the target; confirm against current llc output.
+define i128 @test_cmpxchg_monotonic_seq_cst(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_monotonic_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_seq_cst_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_seq_cst_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic seq_cst
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acquire_monotonic(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acquire_monotonic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_monotonic_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_monotonic_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_monotonic_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire monotonic
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acquire_acquire(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acquire_acquire(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_acquire_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_acquire_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_acquire_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire acquire
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acquire_seq_cst(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acquire_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_seq_cst_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_seq_cst_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire seq_cst
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_release_monotonic(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_release_monotonic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_monotonic_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_monotonic_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_monotonic_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.release.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release monotonic
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_release_acquire(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_release_acquire(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_acquire_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_acquire_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_acquire_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release acquire
+ ret i128 %new
+}
+
+; NOTE(review): release+seq_cst below expects atom.acquire (release half not visible
+; in the cas ordering) -- looks intentional per the generated output, but verify.
+define i128 @test_cmpxchg_release_seq_cst(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_release_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_seq_cst_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_seq_cst_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release seq_cst
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acq_rel_monotonic(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acq_rel_monotonic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_monotonic_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_monotonic_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_monotonic_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel monotonic
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acq_rel_acquire(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acq_rel_acquire(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_acquire_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_acquire_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_acquire_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel acquire
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_acq_rel_seq_cst(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_acq_rel_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_seq_cst_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_seq_cst_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel seq_cst
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_seq_cst_monotonic(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_seq_cst_monotonic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_monotonic_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_monotonic_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_monotonic_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst monotonic
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_seq_cst_acquire(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_seq_cst_acquire(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_acquire_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_acquire_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_acquire_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst acquire
+ ret i128 %new
+}
+
+define i128 @test_cmpxchg_seq_cst_seq_cst(ptr %addr, i128 %cmp, i128 %new) {
+; CHECK-LABEL: test_cmpxchg_seq_cst_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_seq_cst_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_seq_cst_param_2];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3};
+; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5};
+; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst seq_cst
+ ret i128 %new
+}
+
+; NOTE(review): the atomicrmw tests below expand to a CAS retry loop
+; (%atomicrmw.start / %atomicrmw.end) around atom.relaxed.sys.cas.b128 -- the
+; xor/or/setp.ne tail compares the loaded value against the expected one and
+; branches back on failure. Presumably PTX has no native b128 form of these
+; RMW ops; confirm against the PTX ISA before relying on that.
+define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_and(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_and_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_and_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p1 bra $L__BB34_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw and ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_or(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_or_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_or_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p1 bra $L__BB35_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw or ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_xor(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_xor_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_xor_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p1 bra $L__BB36_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw xor ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+; NOTE(review): min/max/umin/umax synthesize the 128-bit compare from two 64-bit
+; halves (low-half unsigned compare qualified by high-half equality, or'd with the
+; high-half signed/unsigned compare), then select the smaller/larger pair.
+define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_min(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<7>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_min_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_min_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5;
+; CHECK-NEXT: or.pred %p5, %p3, %p4;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p6 bra $L__BB37_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw min ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_max(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<7>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_max_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_max_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5;
+; CHECK-NEXT: or.pred %p5, %p3, %p4;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p6 bra $L__BB38_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw max ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_umin(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<7>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umin_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umin_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5;
+; CHECK-NEXT: or.pred %p5, %p3, %p4;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p6 bra $L__BB39_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw umin ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}
+
+define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) {
+; CHECK-LABEL: test_atomicrmw_umax(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<7>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umax_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umax_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
+; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5;
+; CHECK-NEXT: or.pred %p5, %p3, %p4;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 cmp, swap, dst;
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
+; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
+; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
+; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
+; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
+; CHECK-NEXT: @%p6 bra $L__BB40_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ret;
+ %ret = atomicrmw umax ptr %ptr, i128 %val monotonic
+ ret i128 %ret
+}