Skip to content

Conversation

@AlexMaclean
Copy link
Member

@AlexMaclean AlexMaclean commented Apr 30, 2025

Ensure that when custom lowering a vector load/store to a multi-output load/store node we confirm that the memory value type matches the type used by the node. Also add some asserts for basic sanity checking of load size.

Fixes #138034

@llvmbot
Copy link
Member

llvmbot commented Apr 30, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/138049.diff

3 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+13)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+15-12)
  • (added) llvm/test/CodeGen/NVPTX/i128-ld-st.ll (+28)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..57f35a827d9cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
 #include <optional>
 
 using namespace llvm;
@@ -1141,6 +1142,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   else
     FromType = getLdStRegType(ScalarVT);
 
+  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+         FromTypeWidth <= 128 && "Invalid width for load");
+
   // Create the machine instruction DAG
   SDValue Offset, Base;
   SelectADDR(N->getOperand(1), Base, Offset);
@@ -1236,6 +1240,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     FromType = NVPTX::PTXLdStInstCode::Untyped;
   }
 
+  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+         FromTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for load");
+
   SDValue Offset, Base;
   SelectADDR(N->getOperand(1), Base, Offset);
   SDValue Ops[] = {getI32Imm(Ordering, DL),
@@ -1453,6 +1460,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   // Create the machine instruction DAG
   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
 
+  assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 &&
+         ToTypeWidth <= 128 && "Invalid width for store");
+
   SDValue Offset, Base;
   SelectADDR(ST->getBasePtr(), Base, Offset);
 
@@ -1537,6 +1547,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     ToType = NVPTX::PTXLdStInstCode::Untyped;
   }
 
+  assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 &&
+         ToTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for store");
+
   SDValue Offset, Base;
   SelectADDR(N2, Base, Offset);
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c41741ed10232..2f24667cb3cde 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3191,20 +3191,22 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
-  SDNode *N = Op.getNode();
+  MemSDNode *N = cast<MemSDNode>(Op.getNode());
   SDValue Val = N->getOperand(1);
   SDLoc DL(N);
-  EVT ValVT = Val.getValueType();
+  const EVT ValVT = Val.getValueType();
+  const EVT MemVT = N->getMemoryVT();
+  if (ValVT != MemVT)
+    return SDValue();
 
   const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
   if (!NumEltsAndEltVT)
     return SDValue();
   const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
 
-  MemSDNode *MemSD = cast<MemSDNode>(N);
   const DataLayout &TD = DAG.getDataLayout();
 
-  Align Alignment = MemSD->getAlign();
+  Align Alignment = N->getAlign();
   Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
   if (Alignment < PrefAlign) {
     // This store is not sufficiently aligned, so bail out and let this vector
@@ -3267,7 +3269,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue NewSt =
       DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
-                              MemSD->getMemoryVT(), MemSD->getMemOperand());
+                              N->getMemoryVT(), N->getMemOperand());
 
   // return DCI.CombineTo(N, NewSt, true);
   return NewSt;
@@ -5762,20 +5764,20 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &Results) {
-  const EVT ResVT = N->getValueType(0);
-  SDLoc DL(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  const EVT ResVT = LD->getValueType(0);
+  const EVT MemVT = LD->getMemoryVT();
+  if (ResVT != MemVT)
+    return;
 
   const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
   if (!NumEltsAndEltVT)
     return;
   const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
 
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
   Align Alignment = LD->getAlign();
   const auto &TD = DAG.getDataLayout();
-  Align PrefAlign =
-      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
   if (Alignment < PrefAlign) {
     // This load is not sufficiently aligned, so bail out and let this vector
     // load be scalarized.  Note that we may still be able to emit smaller
@@ -5806,9 +5808,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     break;
   }
   }
+  SDLoc DL(LD);
 
   // Copy regular operands
-  SmallVector<SDValue, 8> OtherOps(N->ops());
+  SmallVector<SDValue, 8> OtherOps(LD->ops());
 
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
new file mode 100644
index 0000000000000..f3d04f133a8d4
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O0 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i128 @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    bra.uni $L__BB0_1;
+; CHECK-NEXT:  $L__BB0_1: // %while.cond
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov.b64 %rd1, 0;
+; CHECK-NEXT:    ld.u8 %rd2, [%rd1];
+; CHECK-NEXT:    st.v2.u64 [%rd1], {%rd2, %rd1};
+; CHECK-NEXT:    bra.uni $L__BB0_1;
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %0 = load i8, ptr null, align 1
+  %conv = zext i8 %0 to i128
+  store i128 %conv, ptr null, align 16
+  br label %while.cond
+}

@github-actions
Copy link

github-actions bot commented Apr 30, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@AlexMaclean AlexMaclean force-pushed the dev/amaclean/upstream/quick-fix-i128 branch from 403c693 to 3e4b9c3 Compare April 30, 2025 23:17
Copy link
Contributor

@jhuber6 jhuber6 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, looks straightforward enough to me but someone more familiar with the NVPTX backend might know more.

Copy link
Member

@Artem-B Artem-B left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, with minor nits.
Thank you for the quick fix!

; CHECK-NEXT: ld.u8 %rd2, [%rd1];
; CHECK-NEXT: st.v2.u64 [%rd1], {%rd2, %rd1};
; CHECK-NEXT: bra.uni $L__BB0_1;
entry:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test case could be cleaned up a bit, dropping the loop and using valid pointers, so there are no questions of whether the input is valid.

E.g. https://godbolt.org/z/EsvbY3dav

@AlexMaclean AlexMaclean merged commit 6efdcc1 into llvm:main May 1, 2025
11 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented May 1, 2025

LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building llvm at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/11125

Here is the relevant piece of the build log for the reference
Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp   -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--

********************


IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
Ensure that when custom lowering a vector load/store to a multi-output
load/store node we confirm that the memory value type matches the type
used by the node. Also add some asserts for basic sanity checking of
load size.

Fixes llvm#138034
GeorgeARM pushed a commit to GeorgeARM/llvm-project that referenced this pull request May 7, 2025
Ensure that when custom lowering a vector load/store to a multi-output
load/store node we confirm that the memory value type matches the type
used by the node. Also add some asserts for basic sanity checking of
load size.

Fixes llvm#138034
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[NVPTX] Failure in PTX when generating code for i128 regresses libc tests

5 participants