Skip to content

Commit 028c586

Browse files
committed
[DAGCombiner] Spill dynamic insertelt chain in one go
A chain of dynamic insertelts can be spilled at once. This avoids each insertelt being spilled in DAGTypeLegalizer, which reduces code size and compile time.
1 parent 3af95f0 commit 028c586

File tree

2 files changed

+266
-0
lines changed

2 files changed

+266
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23445,6 +23445,65 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2344523445
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
2344623446
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
2344723447
return DAG.getSplat(VT, DL, InVal);
23448+
23449+
if (TLI.getTypeAction(*DAG.getContext(), VT) ==
23450+
TargetLowering::TypeSplitVector) {
23451+
// For dynamic insertelts, the type legalizer may spill the entire
23452+
// vector. For a chain of dynamic insertelts, this can be really
23453+
// inefficient and bad for compile time. If each insertelt is only fed
23454+
// into the next, the vector is write-only across this chain, and we can
23455+
// just spill once.
23456+
SmallVector<SDNode *> Seq{N};
23457+
while (true) {
23458+
SDValue InVec = Seq.back()->getOperand(0);
23459+
if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT ||
23460+
isa<ConstantSDNode>(InVec.getOperand(2)))
23461+
break;
23462+
Seq.push_back(InVec.getNode());
23463+
}
23464+
23465+
// Only care about chains, otherwise this instruction can be handled by
23466+
// the type legalizer just fine.
23467+
if (Seq.size() > 1) {
23468+
// In cases where the vector is illegal it will be broken down into
23469+
// parts and stored in parts - we should use the alignment for the
23470+
// smallest part.
23471+
Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
23472+
SDValue StackPtr =
23473+
DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
23474+
auto &MF = DAG.getMachineFunction();
23475+
auto FrameIndex =
23476+
cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23477+
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
23478+
23479+
// Begin spilling
23480+
SDValue InVec = Seq.back()->getOperand(0);
23481+
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
23482+
PtrInfo, SmallestAlign);
23483+
23484+
// Lower each dynamic insertelt to a store
23485+
for (SDNode *N : reverse(Seq)) {
23486+
SDValue Elmnt = N->getOperand(1);
23487+
SDValue Index = N->getOperand(2);
23488+
23489+
// Store the new element. This may be larger than the vector element
23490+
// type, so use a truncating store.
23491+
SDValue EltPtr =
23492+
TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
23493+
EVT EltVT = Elmnt.getValueType();
23494+
Store = DAG.getTruncStore(
23495+
Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
23496+
EltVT,
23497+
commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
23498+
}
23499+
23500+
// Load the spilled vector
23501+
SDValue Load =
23502+
DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
23503+
return Load.getValue(0);
23504+
}
23505+
}
23506+
2344823507
return SDValue();
2344923508
}
2345023509

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; COM: Spill the vector once.
8+
define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
9+
; CHECK-LABEL: spill_once(
10+
; CHECK: {
11+
; CHECK-NEXT: .local .align 8 .b8 __local_depot0[64];
12+
; CHECK-NEXT: .reg .b64 %SP;
13+
; CHECK-NEXT: .reg .b64 %SPL;
14+
; CHECK-NEXT: .reg .b64 %rd<39>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0: // %entry
17+
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
18+
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
19+
; CHECK-NEXT: ld.param.b64 %rd1, [spill_once_param_0];
20+
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
21+
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1];
22+
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+16];
23+
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+32];
24+
; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_once_param_1+48];
25+
; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
26+
; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
27+
; CHECK-NEXT: ld.param.b32 %rd14, [spill_once_param_2];
28+
; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
29+
; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
30+
; CHECK-NEXT: add.u64 %rd17, %SP, 0;
31+
; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
32+
; CHECK-NEXT: ld.param.b32 %rd19, [spill_once_param_3];
33+
; CHECK-NEXT: and.b64 %rd20, %rd19, 7;
34+
; CHECK-NEXT: shl.b64 %rd21, %rd20, 3;
35+
; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
36+
; CHECK-NEXT: ld.param.b32 %rd23, [spill_once_param_4];
37+
; CHECK-NEXT: and.b64 %rd24, %rd23, 7;
38+
; CHECK-NEXT: shl.b64 %rd25, %rd24, 3;
39+
; CHECK-NEXT: add.s64 %rd26, %rd17, %rd25;
40+
; CHECK-NEXT: st.b64 [%SP+56], %rd11;
41+
; CHECK-NEXT: st.b64 [%SP+48], %rd10;
42+
; CHECK-NEXT: st.b64 [%SP+40], %rd9;
43+
; CHECK-NEXT: st.b64 [%SP+32], %rd8;
44+
; CHECK-NEXT: st.b64 [%SP+24], %rd7;
45+
; CHECK-NEXT: st.b64 [%SP+16], %rd6;
46+
; CHECK-NEXT: st.b64 [%SP+8], %rd5;
47+
; CHECK-NEXT: st.b64 [%SP], %rd4;
48+
; CHECK-NEXT: st.b64 [%rd18], %rd2;
49+
; CHECK-NEXT: st.b64 [%rd22], %rd3;
50+
; CHECK-NEXT: st.b64 [%rd26], %rd12;
51+
; CHECK-NEXT: ld.param.b32 %rd27, [spill_once_param_5];
52+
; CHECK-NEXT: and.b64 %rd28, %rd27, 7;
53+
; CHECK-NEXT: shl.b64 %rd29, %rd28, 3;
54+
; CHECK-NEXT: add.s64 %rd30, %rd17, %rd29;
55+
; CHECK-NEXT: st.b64 [%rd30], %rd13;
56+
; CHECK-NEXT: ld.b64 %rd31, [%SP+8];
57+
; CHECK-NEXT: ld.b64 %rd32, [%SP];
58+
; CHECK-NEXT: ld.b64 %rd33, [%SP+24];
59+
; CHECK-NEXT: ld.b64 %rd34, [%SP+16];
60+
; CHECK-NEXT: ld.b64 %rd35, [%SP+40];
61+
; CHECK-NEXT: ld.b64 %rd36, [%SP+32];
62+
; CHECK-NEXT: ld.b64 %rd37, [%SP+56];
63+
; CHECK-NEXT: ld.b64 %rd38, [%SP+48];
64+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
65+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
66+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
67+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
68+
; CHECK-NEXT: ret;
69+
entry:
70+
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
71+
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
72+
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
73+
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
74+
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
75+
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
76+
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
77+
%element.3 = load double, ptr addrspace(3) %offset.3, align 8
78+
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
79+
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
80+
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
81+
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
82+
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
83+
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
84+
ret void
85+
}
86+
87+
; COM: Spill the vector twice. Because these are in two different slots, the
88+
; resulting spill codes may be non-overlapping even though the insertelt
89+
; sequences overlap.
90+
define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
91+
; CHECK-LABEL: spill_twice(
92+
; CHECK: {
93+
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[128];
94+
; CHECK-NEXT: .reg .b64 %SP;
95+
; CHECK-NEXT: .reg .b64 %SPL;
96+
; CHECK-NEXT: .reg .b64 %rd<51>;
97+
; CHECK-EMPTY:
98+
; CHECK-NEXT: // %bb.0: // %entry
99+
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
100+
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
101+
; CHECK-NEXT: ld.param.b64 %rd1, [spill_twice_param_0];
102+
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
103+
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1];
104+
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+16];
105+
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+32];
106+
; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_twice_param_1+48];
107+
; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
108+
; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
109+
; CHECK-NEXT: ld.param.b32 %rd14, [spill_twice_param_2];
110+
; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
111+
; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
112+
; CHECK-NEXT: add.u64 %rd17, %SP, 0;
113+
; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
114+
; CHECK-NEXT: add.u64 %rd19, %SP, 64;
115+
; CHECK-NEXT: add.s64 %rd20, %rd19, %rd16;
116+
; CHECK-NEXT: ld.param.b32 %rd21, [spill_twice_param_3];
117+
; CHECK-NEXT: and.b64 %rd22, %rd21, 7;
118+
; CHECK-NEXT: shl.b64 %rd23, %rd22, 3;
119+
; CHECK-NEXT: add.s64 %rd24, %rd17, %rd23;
120+
; CHECK-NEXT: add.s64 %rd25, %rd19, %rd23;
121+
; CHECK-NEXT: st.b64 [%SP+120], %rd11;
122+
; CHECK-NEXT: st.b64 [%SP+112], %rd10;
123+
; CHECK-NEXT: st.b64 [%SP+104], %rd9;
124+
; CHECK-NEXT: st.b64 [%SP+96], %rd8;
125+
; CHECK-NEXT: st.b64 [%SP+88], %rd7;
126+
; CHECK-NEXT: st.b64 [%SP+80], %rd6;
127+
; CHECK-NEXT: st.b64 [%SP+72], %rd5;
128+
; CHECK-NEXT: st.b64 [%SP+64], %rd4;
129+
; CHECK-NEXT: st.b64 [%rd20], %rd2;
130+
; CHECK-NEXT: st.b64 [%rd25], %rd3;
131+
; CHECK-NEXT: ld.param.b32 %rd26, [spill_twice_param_4];
132+
; CHECK-NEXT: and.b64 %rd27, %rd26, 7;
133+
; CHECK-NEXT: shl.b64 %rd28, %rd27, 3;
134+
; CHECK-NEXT: add.s64 %rd29, %rd19, %rd28;
135+
; CHECK-NEXT: st.b64 [%rd29], %rd12;
136+
; CHECK-NEXT: add.s64 %rd30, %rd17, %rd28;
137+
; CHECK-NEXT: ld.b64 %rd31, [%SP+72];
138+
; CHECK-NEXT: ld.b64 %rd32, [%SP+64];
139+
; CHECK-NEXT: ld.b64 %rd33, [%SP+88];
140+
; CHECK-NEXT: ld.b64 %rd34, [%SP+80];
141+
; CHECK-NEXT: ld.b64 %rd35, [%SP+104];
142+
; CHECK-NEXT: ld.b64 %rd36, [%SP+96];
143+
; CHECK-NEXT: ld.b64 %rd37, [%SP+120];
144+
; CHECK-NEXT: ld.b64 %rd38, [%SP+112];
145+
; CHECK-NEXT: st.b64 [%SP+56], %rd11;
146+
; CHECK-NEXT: st.b64 [%SP+48], %rd10;
147+
; CHECK-NEXT: st.b64 [%SP+40], %rd9;
148+
; CHECK-NEXT: st.b64 [%SP+32], %rd8;
149+
; CHECK-NEXT: st.b64 [%SP+24], %rd7;
150+
; CHECK-NEXT: st.b64 [%SP+16], %rd6;
151+
; CHECK-NEXT: st.b64 [%SP+8], %rd5;
152+
; CHECK-NEXT: st.b64 [%SP], %rd4;
153+
; CHECK-NEXT: st.b64 [%rd18], %rd2;
154+
; CHECK-NEXT: st.b64 [%rd24], %rd3;
155+
; CHECK-NEXT: st.b64 [%rd30], %rd12;
156+
; CHECK-NEXT: ld.param.b32 %rd39, [spill_twice_param_5];
157+
; CHECK-NEXT: and.b64 %rd40, %rd39, 7;
158+
; CHECK-NEXT: shl.b64 %rd41, %rd40, 3;
159+
; CHECK-NEXT: add.s64 %rd42, %rd17, %rd41;
160+
; CHECK-NEXT: st.b64 [%rd42], %rd13;
161+
; CHECK-NEXT: ld.b64 %rd43, [%SP+8];
162+
; CHECK-NEXT: ld.b64 %rd44, [%SP];
163+
; CHECK-NEXT: ld.b64 %rd45, [%SP+24];
164+
; CHECK-NEXT: ld.b64 %rd46, [%SP+16];
165+
; CHECK-NEXT: ld.b64 %rd47, [%SP+40];
166+
; CHECK-NEXT: ld.b64 %rd48, [%SP+32];
167+
; CHECK-NEXT: ld.b64 %rd49, [%SP+56];
168+
; CHECK-NEXT: ld.b64 %rd50, [%SP+48];
169+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd50, %rd49};
170+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd48, %rd47};
171+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd46, %rd45};
172+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd44, %rd43};
173+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1144], {%rd38, %rd37};
174+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1128], {%rd36, %rd35};
175+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1112], {%rd34, %rd33};
176+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1096], {%rd32, %rd31};
177+
; CHECK-NEXT: ret;
178+
entry:
179+
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
180+
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
181+
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
182+
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
183+
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
184+
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
185+
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
186+
%element.3 = load double, ptr addrspace(3) %offset.3, align 8
187+
188+
; COM: begin chain 1
189+
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
190+
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
191+
192+
; COM: interleave a second chain of insertelements
193+
%vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
194+
195+
; COM: continue chain 1
196+
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
197+
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
198+
199+
; COM: save chain 1
200+
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
201+
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
202+
203+
; COM: save chain 2
204+
%location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
205+
store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
206+
ret void
207+
}

0 commit comments

Comments
 (0)