-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[NVPTX] fix type propagation when expanding Store[V4 -> V8] #151576
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-nvptx Author: Princeton Ferro (Prince781) ChangesThis was an edge case we missed. Propagate the correct type when expanding a StoreV4 x <2 x float> to StoreV8 x float. Full diff: https://github.com/llvm/llvm-project/pull/151576.diff 2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4fd362303b6e5..65d1be3a3847d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4917,7 +4917,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
auto *LD = cast<MemSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
SDLoc DL(LD);
// the new opcode after we double the number of operands
@@ -4958,9 +4957,9 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
// Create the new load
- SDValue NewLoad =
- DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
- Operands, MemVT, LD->getMemOperand());
+ SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
+ Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
+ LD->getMemOperand());
// Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
// the outputs the same. These nodes will be optimized away in later
@@ -5002,7 +5001,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
auto *ST = cast<MemSDNode>(N);
- EVT MemVT = ElementVT.getVectorElementType();
// The new opcode after we double the number of operands.
NVPTXISD::NodeType Opcode;
@@ -5011,11 +5009,9 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Any packed type is legal, so the legalizer will not have lowered
// ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
// it here.
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
case NVPTXISD::StoreV2:
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
break;
case NVPTXISD::StoreV4:
@@ -5066,7 +5062,7 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Now we replace the store
return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
- MemVT, ST->getMemOperand());
+ ST->getMemoryVT(), ST->getMemOperand());
}
static SDValue PerformStoreCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll
new file mode 100644
index 0000000000000..b5f62c0c97e4c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK-F32X2
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+target triple = "nvptx64-nvidia-cuda"
+
+; Since fdiv doesn't support f32x2, this will create BUILD_VECTORs that will be
+; folded into the store, turning it into st.global.v8.b32.
+define void @writevec(<8 x float> %v1, <8 x float> %v2, ptr addrspace(1) %p) {
+; CHECK-F32X2-LABEL: writevec(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<25>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [writevec_param_0];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [writevec_param_0+16];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [writevec_param_1+16];
+; CHECK-F32X2-NEXT: div.rn.f32 %r13, %r8, %r12;
+; CHECK-F32X2-NEXT: div.rn.f32 %r14, %r7, %r11;
+; CHECK-F32X2-NEXT: div.rn.f32 %r15, %r6, %r10;
+; CHECK-F32X2-NEXT: div.rn.f32 %r16, %r5, %r9;
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [writevec_param_1];
+; CHECK-F32X2-NEXT: div.rn.f32 %r21, %r4, %r20;
+; CHECK-F32X2-NEXT: div.rn.f32 %r22, %r3, %r19;
+; CHECK-F32X2-NEXT: div.rn.f32 %r23, %r2, %r18;
+; CHECK-F32X2-NEXT: div.rn.f32 %r24, %r1, %r17;
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [writevec_param_2];
+; CHECK-F32X2-NEXT: st.global.v8.b32 [%rd1], {%r24, %r23, %r22, %r21, %r16, %r15, %r14, %r13};
+; CHECK-F32X2-NEXT: ret;
+ %v = fdiv <8 x float> %v1, %v2
+ store <8 x float> %v, ptr addrspace(1) %p, align 32
+ ret void
+}
|
AlexMaclean
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks reasonable to me.
35b58ca to
8946dab
Compare
Propagate the correct type when expanding a StoreV4 x <2 x float> to StoreV8 x float.
8946dab to
78e222a
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/11/builds/20836 Here is the relevant piece of the build log for the reference |
This was an edge case we missed. Propagate the correct type when expanding a StoreV4 x <2 x float> to StoreV8 x float.