From 3e4b9c3657d3f266fbdc75876a3c881f26704cb6 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Wed, 30 Apr 2025 23:10:00 +0000 Subject: [PATCH 1/2] [NVPTX] Fixup EXT_LOAD lowering for i128 values --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 13 ++++++++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 27 +++++++++++--------- llvm/test/CodeGen/NVPTX/i128-ld-st.ll | 28 +++++++++++++++++++++ 3 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/i128-ld-st.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 295ed666a1902..5c41ac261224d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" #include <optional> using namespace llvm; @@ -1141,6 +1142,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { else FromType = getLdStRegType(ScalarVT); + assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && + FromTypeWidth <= 128 && "Invalid width for load"); + // Create the machine instruction DAG SDValue Offset, Base; SelectADDR(N->getOperand(1), Base, Offset); @@ -1236,6 +1240,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { FromType = NVPTX::PTXLdStInstCode::Untyped; } + assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && + FromTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for load"); + SDValue Offset, Base; SelectADDR(N->getOperand(1), Base, Offset); SDValue Ops[] = {getI32Imm(Ordering, DL), @@ -1453,6 +1460,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { // Create the machine instruction DAG SDValue Value = PlainStore ? 
PlainStore->getValue() : AtomicStore->getVal(); + assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && + "Invalid width for store"); + SDValue Offset, Base; SelectADDR(ST->getBasePtr(), Base, Offset); @@ -1537,6 +1547,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToType = NVPTX::PTXLdStInstCode::Untyped; } + assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && + TotalWidth <= 128 && "Invalid width for store"); + SDValue Offset, Base; SelectADDR(N2, Base, Offset); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index c41741ed10232..2f24667cb3cde 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3191,20 +3191,22 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { - SDNode *N = Op.getNode(); + MemSDNode *N = cast<MemSDNode>(Op.getNode()); SDValue Val = N->getOperand(1); SDLoc DL(N); - EVT ValVT = Val.getValueType(); + const EVT ValVT = Val.getValueType(); + const EVT MemVT = N->getMemoryVT(); + if (ValVT != MemVT) + return SDValue(); const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT); if (!NumEltsAndEltVT) return SDValue(); const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); - MemSDNode *MemSD = cast<MemSDNode>(N); const DataLayout &TD = DAG.getDataLayout(); - Align Alignment = MemSD->getAlign(); + Align Alignment = N->getAlign(); Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); if (Alignment < PrefAlign) { // This store is not sufficiently aligned, so bail out and let this vector @@ -3267,7 +3269,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { SDValue NewSt = DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, - MemSD->getMemoryVT(), MemSD->getMemOperand()); + N->getMemoryVT(), N->getMemOperand()); // return 
DCI.CombineTo(N, NewSt, true); return NewSt; @@ -5762,20 +5764,20 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) { - const EVT ResVT = N->getValueType(0); - SDLoc DL(N); + LoadSDNode *LD = cast<LoadSDNode>(N); + const EVT ResVT = LD->getValueType(0); + const EVT MemVT = LD->getMemoryVT(); + if (ResVT != MemVT) + return; const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT); if (!NumEltsAndEltVT) return; const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); - LoadSDNode *LD = cast<LoadSDNode>(N); - Align Alignment = LD->getAlign(); const auto &TD = DAG.getDataLayout(); - Align PrefAlign = - TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext())); + Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext())); if (Alignment < PrefAlign) { // This load is not sufficiently aligned, so bail out and let this vector // load be scalarized. 
Note that we may still be able to emit smaller @@ -5806,9 +5808,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, break; } } + SDLoc DL(LD); // Copy regular operands - SmallVector<SDValue, 8> OtherOps(N->ops()); + SmallVector<SDValue, 8> OtherOps(LD->ops()); // The select routine does not have access to the LoadSDNode instance, so // pass along the extension information diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll new file mode 100644 index 0000000000000..f3d04f133a8d4 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -O0 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -O0 -mcpu=sm_20 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define i128 @foo() { +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: bra.uni $L__BB0_1; +; CHECK-NEXT: $L__BB0_1: // %while.cond +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov.b64 %rd1, 0; +; CHECK-NEXT: ld.u8 %rd2, [%rd1]; +; CHECK-NEXT: st.v2.u64 [%rd1], {%rd2, %rd1}; +; CHECK-NEXT: bra.uni $L__BB0_1; +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %0 = load i8, ptr null, align 1 + %conv = zext i8 %0 to i128 + store i128 %conv, ptr null, align 16 + br label %while.cond +} From 2be73ec71bc73522d361db6487c58a7df49fb625 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Thu, 1 May 2025 00:12:03 +0000 Subject: [PATCH 2/2] address comments --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 ++++ llvm/test/CodeGen/NVPTX/i128-ld-st.ll | 32 +++++++++------------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2f24667cb3cde..44bfd303fc861 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ 
b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3196,6 +3196,9 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(N); const EVT ValVT = Val.getValueType(); const EVT MemVT = N->getMemoryVT(); + + // If we're truncating as part of the store, avoid lowering to a StoreV node. + // TODO: consider relaxing this restriction. if (ValVT != MemVT) return SDValue(); @@ -5767,6 +5770,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = cast<LoadSDNode>(N); const EVT ResVT = LD->getValueType(0); const EVT MemVT = LD->getMemoryVT(); + + // If we're doing sign/zero extension as part of the load, avoid lowering to + // a LoadV node. TODO: consider relaxing this restriction. if (ResVT != MemVT) return; diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll index f3d04f133a8d4..41cffe9cdbf90 100644 --- a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll +++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll @@ -4,25 +4,21 @@ target triple = "nvptx64-nvidia-cuda" -define i128 @foo() { +define i128 @foo(ptr %p, ptr %o) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: bra.uni $L__BB0_1; -; CHECK-NEXT: $L__BB0_1: // %while.cond -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov.b64 %rd1, 0; -; CHECK-NEXT: ld.u8 %rd2, [%rd1]; -; CHECK-NEXT: st.v2.u64 [%rd1], {%rd2, %rd1}; -; CHECK-NEXT: bra.uni $L__BB0_1; -entry: - br label %while.cond - -while.cond: ; preds = %while.cond, %entry - %0 = load i8, ptr null, align 1 - %conv = zext i8 %0 to i128 - store i128 %conv, ptr null, align 16 - br label %while.cond +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [foo_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.u8 %rd3, [%rd1]; +; CHECK-NEXT: mov.b64 %rd4, 0; +; CHECK-NEXT: st.v2.u64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], 
{%rd3, %rd4}; +; CHECK-NEXT: ret; + %c = load i8, ptr %p, align 1 + %i = zext i8 %c to i128 + store i128 %i, ptr %o, align 16 + ret i128 %i }