[NVPTX] add support for 128-bit atomics #154852
Conversation
@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Patch is 54.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154852.diff

7 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 3300ed9a5a81c..2143019f4923b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -170,6 +170,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
}
break;
}
+ case NVPTXISD::ATOMIC_CMP_SWAP_B128:
+ case NVPTXISD::ATOMIC_SWAP_B128:
+ selectAtomic128(N);
+ return;
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB:
@@ -2337,3 +2341,28 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
}
}
}
+
+void NVPTXDAGToDAGISel::selectAtomic128(SDNode *N) {
+ MemSDNode *AN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
+ SmallVector<SDValue, 5> Ops{Base, Offset};
+ Ops.append(N->op_begin() + 2, N->op_end());
+ Ops.append({
+ getI32Imm(getMemOrder(AN), dl),
+ getI32Imm(getAtomicScope(AN), dl),
+ getI32Imm(getAddrSpace(AN), dl),
+ });
+
+ assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
+ N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
+ unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
+ ? NVPTX::ATOM_EXCH_B128
+ : NVPTX::ATOM_CAS_B128;
+
+ auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
+
+ ReplaceNode(N, ATOM);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e2ad55bc1796d..b5a4bedfe1101 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -90,6 +90,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool IsIm2Col = false);
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
void SelectTcgen05St(SDNode *N, bool hasOffset = false);
+ void selectAtomic128(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 488f049dc2b3d..2746e997a44a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1026,7 +1026,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
- // No FPOW or FREM in PTX.
+
+ // atom.b128 is legal in PTX but since we don't represent i128 as a legal
+ // type, we need to custom lower it.
+ setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
+ Custom);
// Now deduce the information based on the above mentioned
// actions
@@ -1034,7 +1038,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// PTX support for 16-bit CAS is emulated. Only use 32+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
- setMaxAtomicSizeInBitsSupported(64);
+ setMaxAtomicSizeInBitsSupported(128);
setMaxDivRemBitWidthSupported(64);
// Custom lowering for tcgen05.ld vector operands
@@ -1067,6 +1071,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
case NVPTXISD::FIRST_NUMBER:
break;
+ MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128)
+ MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128)
MAKE_CASE(NVPTXISD::RET_GLUE)
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
@@ -6226,6 +6232,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
Results.push_back(Res);
}
+static void replaceAtomic128(SDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getValueType(0) == MVT::i128 &&
+ "Custom lowering for atomic128 only supports i128");
+
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDLoc dl(N);
+
+ if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 83) {
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ DAG.getMachineFunction().getFunction(),
+ "Support for b128 atomics introduced in PTX ISA version 8.3 and "
+ "requires target sm_90.",
+ dl.getDebugLoc()));
+
+ Results.push_back(DAG.getUNDEF(MVT::i128));
+ Results.push_back(AN->getOperand(0)); // Chain
+ return;
+ }
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(AN->getOperand(0)); // Chain
+ Ops.push_back(AN->getOperand(1)); // Ptr
+ for (const auto &Op : AN->ops().drop_front(2)) {
+ // Low part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(0, dl)));
+ // High part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(1, dl)));
+ }
+ unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
+ ? NVPTXISD::ATOMIC_SWAP_B128
+ : NVPTXISD::ATOMIC_CMP_SWAP_B128;
+ SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
+ AN->getMemOperand());
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
+ {Result.getValue(0), Result.getValue(1)}));
+ Results.push_back(Result.getValue(2));
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6246,6 +6295,10 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case NVPTXISD::ProxyReg:
replaceProxyReg(N, DAG, *this, Results);
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ replaceAtomic128(N, DAG, STI, Results);
+ return;
}
}
@@ -6270,16 +6323,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
assert(Ty->isIntegerTy() && "Ty should be integer at this point");
- auto ITy = cast<llvm::IntegerType>(Ty);
+ const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
switch (AI->getOperation()) {
default:
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::BinOp::Xchg:
+ if (BitWidth == 128)
+ return AtomicExpansionKind::None;
+ LLVM_FALLTHROUGH;
case AtomicRMWInst::BinOp::And:
case AtomicRMWInst::BinOp::Or:
case AtomicRMWInst::BinOp::Xor:
- case AtomicRMWInst::BinOp::Xchg:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6289,6 +6345,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomBitwise64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
@@ -6298,7 +6356,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::BinOp::Min:
case AtomicRMWInst::BinOp::UMax:
case AtomicRMWInst::BinOp::UMin:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6308,17 +6366,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomMinMax64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
case AtomicRMWInst::BinOp::UIncWrap:
case AtomicRMWInst::BinOp::UDecWrap:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 32:
return AtomicExpansionKind::None;
case 8:
case 16:
case 64:
+ case 128:
return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e7f1a4b4c98c4..80d2f626767ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -81,7 +81,17 @@ enum NodeType : unsigned {
CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z,
FIRST_MEMORY_OPCODE,
- LoadV2 = FIRST_MEMORY_OPCODE,
+
+ /// These nodes are used to lower atomic instructions with i128 type. They are
+ /// similar to the generic nodes, but the input and output values are split
+ /// into two 64-bit values.
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP_B128(INCHAIN, ptr, cmpLo, cmpHi,
+ /// swapLo, swapHi)
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP_B128(INCHAIN, ptr, amtLo, amtHi)
+ ATOMIC_CMP_SWAP_B128 = FIRST_MEMORY_OPCODE,
+ ATOMIC_SWAP_B128,
+
+ LoadV2,
LoadV4,
LoadV8,
LDUV2, // LDU.v2
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 721afae4db51c..d5c8cf5f209f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1990,19 +1990,23 @@ multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode a
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.RC:$b, t.RC:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
}
@@ -2200,6 +2204,36 @@ defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">;
defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
+// atom.*.b128
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+ def ATOM_CAS_B128 :
+ NVPTXInst<
+ (outs B64:$dst0, B64:$dst1),
+ (ins ADDR:$addr, B64:$cmp0, B64:$cmp1, B64:$swap0, B64:$swap1,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ "{{\n\t"
+ ".reg .b128 src1, src2, dst;\n\t"
+ "mov.b128 src1, {$cmp0, $cmp1};\n\t"
+ "mov.b128 src2, {$swap0, $swap1};\n\t"
+ "atom${sem:sem}${scope:scope}${addsp:addsp}.cas.b128 dst, $addr, src1, src2;\n\t"
+ "mov.b128 {$dst0, $dst1}, dst;\n\t"
+ "}}">;
+
+ def ATOM_EXCH_B128 :
+ NVPTXInst<
+ (outs B64:$dst0, B64:$dst1),
+ (ins ADDR:$addr, B64:$amt0, B64:$amt1,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ "{{\n\t"
+ ".reg .b128 src1, dst;\n\t"
+ "mov.b128 src1, {$amt0, $amt1};\n\t"
+ "atom${sem:sem}${scope:scope}${addsp:addsp}.exch.b128 dst, $addr, src1;\n\t"
+ "mov.b128 {$dst0, $dst1}, dst;\n\t"
+ "}}">;
+}
+
+
//-----------------------------------
// Support for ldu on sm_20 or later
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll
index b19f6d56b9a91..392cd8b26d27e 100644
--- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll
@@ -4,12 +4,12 @@
; CHECK: error: unsupported cmpxchg
; CHECK: error: unsupported cmpxchg
; CHECK: error: unsupported cmpxchg
-define void @bitwise_i128(ptr %0, i128 %1) {
+define void @bitwise_i256(ptr %0, i256 %1) {
entry:
- %2 = atomicrmw and ptr %0, i128 %1 monotonic, align 16
- %3 = atomicrmw or ptr %0, i128 %1 monotonic, align 16
- %4 = atomicrmw xor ptr %0, i128 %1 monotonic, align 16
- %5 = atomicrmw xchg ptr %0, i128 %1 monotonic, align 16
+ %2 = atomicrmw and ptr %0, i256 %1 monotonic, align 16
+ %3 = atomicrmw or ptr %0, i256 %1 monotonic, align 16
+ %4 = atomicrmw xor ptr %0, i256 %1 monotonic, align 16
+ %5 = atomicrmw xchg ptr %0, i256 %1 monotonic, align 16
ret void
}
@@ -17,11 +17,11 @@ entry:
; CHECK: error: unsupported cmpxchg
; CHECK: error: unsupported cmpxchg
; CHECK: error: unsupported cmpxchg
-define void @minmax_i128(ptr %0, i128 %1) {
+define void @minmax_i256(ptr %0, i256 %1) {
entry:
- %2 = atomicrmw min ptr %0, i128 %1 monotonic, align 16
- %3 = atomicrmw max ptr %0, i128 %1 monotonic, align 16
- %4 = atomicrmw umin ptr %0, i128 %1 monotonic, align 16
- %5 = atomicrmw umax ptr %0, i128 %1 monotonic, align 16
+ %2 = atomicrmw min ptr %0, i256 %1 monotonic, align 16
+ %3 = atomicrmw max ptr %0, i256 %1 monotonic, align 16
+ %4 = atomicrmw umin ptr %0, i256 %1 monotonic, align 16
+ %5 = atomicrmw umax ptr %0, i256 %1 monotonic, align 16
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll
new file mode 100644
index 0000000000000..7d14a1eb38250
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll
@@ -0,0 +1,1000 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc < %s -mcpu=sm_90 -mattr=+ptx82 2>&1 | FileCheck %s --check-prefix=ERROR
+; RUN: not llc < %s -mcpu=sm_80 -mattr=+ptx84 2>&1 | FileCheck %s --check-prefix=ERROR
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx84 | FileCheck %s --check-prefix=CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx84 | %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+;; Check that the first couple of error messages are correct.
+; ERROR: error: <unknown>:0:0: in function test_xchg_generic i128 (ptr, i128): Support for b128 atomics introduced in PTX ISA version 8.3 and requires target sm_90.
+; ERROR: error: <unknown>:0:0: in function test_xchg_global i128 (ptr addrspace(1), i128): Support for b128 atomics introduced in PTX ISA version 8.3 and requires target sm_90.
+
+define i128 @test_xchg_generic(ptr %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_generic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_generic_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_generic_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.sys.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr %addr, i128 %amt release
+ ret i128 %old
+}
+
+define i128 @test_xchg_global(ptr addrspace(1) %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_global(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_global_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_global_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.sys.global.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr addrspace(1) %addr, i128 %amt release
+ ret i128 %old
+}
+
+define i128 @test_xchg_shared(ptr addrspace(3) %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_shared(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.sys.shared.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr addrspace(3) %addr, i128 %amt release
+ ret i128 %old
+}
+
+define i128 @test_xchg_shared_cluster(ptr addrspace(7) %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_shared_cluster(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_cluster_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_cluster_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.sys.shared::cluster.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr addrspace(7) %addr, i128 %amt release
+ ret i128 %old
+}
+
+define i128 @test_xchg_block(ptr %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_block(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_block_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_block_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.cta.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("block") release
+ ret i128 %old
+}
+
+define i128 @test_xchg_cluster(ptr %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_cluster(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_cluster_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_cluster_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.cluster.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("cluster") release
+ ret i128 %old
+}
+
+define i128 @test_xchg_gpu(ptr %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_gpu(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_gpu_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_gpu_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b128 src1, dst;
+; CHECK-NEXT: mov.b128 src1, {%rd2, %rd3};
+; CHECK-NEXT: atom.release.gpu.exch.b128 dst, %rd1, src1;
+; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5};
+; CHECK-NEXT: ret;
+ %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("device") release
+ ret i128 %old
+}
+
+define i128 @test_xchg_sys(ptr %addr, i128 %amt) {
+; CHECK-LABEL: test_xchg_sys(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_sys_param_0];
+; CHECK-NEXT: ld....
[truncated]
replaceAtomicSwap128 and selectAtomicSwap128, maybe? The others are emulated.
Renamed!
Can you add a note that these should be integrated into the script-generated atomicrmw tests in time?
The above comments are nits.
Which is the API that leads AtomicExpand to emulate everything other than 128-bit exchange?
NVPTXTargetLowering::shouldExpandAtomicRMWInIR
Added a TODO for this
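For reference, the effect of that hook after this patch can be sketched at the IR level (hypothetical function name, not taken from the tests): a 128-bit xchg is left alone and selected as atom.exch.b128, while every other 128-bit atomicrmw is rewritten by AtomicExpandPass into a cmpxchg loop, which in turn uses atom.cas.b128.

; Illustrative only: xchg stays native, add is expanded to a CAS loop.
define i128 @rmw_i128_sketch(ptr %p, i128 %x) {
  ; shouldExpandAtomicRMWInIR returns AtomicExpansionKind::None -> atom.exch.b128
  %old = atomicrmw xchg ptr %p, i128 %x monotonic, align 16
  ; shouldExpandAtomicRMWInIR returns AtomicExpansionKind::CmpXChg -> CAS loop
  %unused = atomicrmw add ptr %p, i128 %x monotonic, align 16
  ret i128 %old
}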
✅ With the latest revision this PR passed the C/C++ code formatter.
schwarzschild-radius left a comment
LGTM, thanks!
I think we should have two tests, one for i128 and one for i256, and the i256 should always fail, but the i128 one should fail or pass depending on whether sm < 90 or not.
There is already coverage of i128 failing when unsupported in atomics-b128.ll. I don't think there is any reason to add that here as well.
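For anyone skimming, the i128 coverage there boils down to IR of roughly this shape (a minimal sketch, not copied from atomics-b128.ll; the function name is made up):

define i128 @cas_i128_sketch(ptr %p, i128 %cmp, i128 %new) {
  ; On sm_90 with PTX 8.3+ this selects atom.cas.b128 (packing the i64 halves
  ; with mov.b128); on older targets the lowering emits the b128-atomics error.
  %pair = cmpxchg ptr %p, i128 %cmp, i128 %new acq_rel acquire, align 16
  %old = extractvalue { i128, i1 } %pair, 0
  ret i128 %old
}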
I'd prefer for us to error and fail to legalize on these instead of relying on ptxas for that.
These error messages are from us failing to lower these ops. ptxas is not running here and is not responsible for producing these error messages.
After the update to setMaxAtomicSizeInBitsSupported, these error messages are now emitted by AtomicExpandPass and read "error: unsupported cmpxchg".
This LGTM overall, thank you Alex.
26d8b31 to 82238dc
Artem-B left a comment
LGTM
82238dc to fc82c80
No description provided.