Skip to content

Commit feb03f3

Browse files
committed
[DAGCombiner] Spill dynamic insertelt chain in one go
A chain of dynamic insertelts (that is, insertelt (insertelt (...)) where every index is dynamic) can be spilled to the stack once for the whole chain. This avoids DAGTypeLegalizer spilling the vector separately for each insertelt, which reduces code size and compile time.
1 parent 11fb835 commit feb03f3

File tree

2 files changed

+266
-0
lines changed

2 files changed

+266
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23445,6 +23445,57 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2344523445
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
2344623446
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
2344723447
return DAG.getSplat(VT, DL, InVal);
23448+
23449+
// For dynamic insertelts, the type legalizer will spill the entire vector.
23450+
// For a chain of dynamic insertelts, this can be really inefficient and
23451+
// bad for compile time. If each insertelt is only fed into the next, the
23452+
// vector is write-only across this chain, and we can just spill once.
23453+
SmallVector<SDNode *> Seq{N};
23454+
while (true) {
23455+
SDValue InVec = Seq.back()->getOperand(0);
23456+
SDValue EltNo = InVec.getOperand(2);
23457+
if (!(InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
23458+
!isa<ConstantSDNode>(EltNo)))
23459+
break;
23460+
Seq.push_back(InVec.getNode());
23461+
}
23462+
23463+
// Only care about chains, otherwise this instruction can be handled by
23464+
// the type legalizer just fine.
23465+
if (Seq.size() > 1) {
23466+
// In cases where the vector is illegal it will be broken down into parts
23467+
// and stored in parts - we should use the alignment for the smallest part.
23468+
Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
23469+
SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
23470+
auto &MF = DAG.getMachineFunction();
23471+
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23472+
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
23473+
23474+
// Begin spilling
23475+
SDValue InVec = Seq.back()->getOperand(0);
23476+
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, PtrInfo,
23477+
SmallestAlign);
23478+
23479+
// Lower each dynamic insertelt to a store
23480+
for (SDNode *N : reverse(Seq)) {
23481+
SDValue Elmnt = N->getOperand(1);
23482+
SDValue Index = N->getOperand(2);
23483+
23484+
// Store the new element. This may be larger than the vector element type,
23485+
// so use a truncating store.
23486+
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
23487+
EVT EltVT = Elmnt.getValueType();
23488+
Store = DAG.getTruncStore(
23489+
Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
23490+
EltVT, commonAlignment(SmallestAlign,
23491+
EltVT.getFixedSizeInBits() / 8));
23492+
}
23493+
23494+
// Load the spilled vector
23495+
SDValue Load = DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
23496+
return Load.getValue(0);
23497+
}
23498+
2344823499
return SDValue();
2344923500
}
2345023501

Lines changed: 215 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,215 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; COM: Spill the vector once.
8+
define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
9+
; CHECK-LABEL: spill_once(
10+
; CHECK: {
11+
; CHECK-NEXT: .local .align 8 .b8 __local_depot0[64];
12+
; CHECK-NEXT: .reg .b64 %SP;
13+
; CHECK-NEXT: .reg .b64 %SPL;
14+
; CHECK-NEXT: .reg .b64 %rd<39>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0: // %entry
17+
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
18+
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
19+
; CHECK-NEXT: ld.param.b64 %rd1, [spill_once_param_0];
20+
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [spill_once_param_1];
21+
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1+16];
22+
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+32];
23+
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+48];
24+
; CHECK-NEXT: ld.shared.v2.b64 {%rd10, %rd11}, [%rd1];
25+
; CHECK-NEXT: ld.shared.v2.b64 {%rd12, %rd13}, [%rd1+16];
26+
; CHECK-NEXT: ld.param.b32 %rd14, [spill_once_param_2];
27+
; CHECK-NEXT: shl.b64 %rd15, %rd14, 3;
28+
; CHECK-NEXT: and.b64 %rd16, %rd15, 56;
29+
; CHECK-NEXT: add.u64 %rd17, %SP, 0;
30+
; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
31+
; CHECK-NEXT: ld.param.b32 %rd19, [spill_once_param_3];
32+
; CHECK-NEXT: shl.b64 %rd20, %rd19, 3;
33+
; CHECK-NEXT: and.b64 %rd21, %rd20, 56;
34+
; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
35+
; CHECK-NEXT: ld.param.b32 %rd23, [spill_once_param_4];
36+
; CHECK-NEXT: shl.b64 %rd24, %rd23, 3;
37+
; CHECK-NEXT: and.b64 %rd25, %rd24, 56;
38+
; CHECK-NEXT: add.s64 %rd26, %rd17, %rd25;
39+
; CHECK-NEXT: st.b64 [%SP+56], %rd9;
40+
; CHECK-NEXT: st.b64 [%SP+48], %rd8;
41+
; CHECK-NEXT: st.b64 [%SP+40], %rd7;
42+
; CHECK-NEXT: st.b64 [%SP+32], %rd6;
43+
; CHECK-NEXT: st.b64 [%SP+24], %rd5;
44+
; CHECK-NEXT: st.b64 [%SP+16], %rd4;
45+
; CHECK-NEXT: st.b64 [%SP+8], %rd3;
46+
; CHECK-NEXT: st.b64 [%SP], %rd2;
47+
; CHECK-NEXT: st.b64 [%rd18], %rd10;
48+
; CHECK-NEXT: st.b64 [%rd22], %rd11;
49+
; CHECK-NEXT: st.b64 [%rd26], %rd12;
50+
; CHECK-NEXT: ld.param.b32 %rd27, [spill_once_param_5];
51+
; CHECK-NEXT: shl.b64 %rd28, %rd27, 3;
52+
; CHECK-NEXT: and.b64 %rd29, %rd28, 56;
53+
; CHECK-NEXT: add.s64 %rd30, %rd17, %rd29;
54+
; CHECK-NEXT: st.b64 [%rd30], %rd13;
55+
; CHECK-NEXT: ld.b64 %rd31, [%SP+8];
56+
; CHECK-NEXT: ld.b64 %rd32, [%SP];
57+
; CHECK-NEXT: ld.b64 %rd33, [%SP+24];
58+
; CHECK-NEXT: ld.b64 %rd34, [%SP+16];
59+
; CHECK-NEXT: ld.b64 %rd35, [%SP+40];
60+
; CHECK-NEXT: ld.b64 %rd36, [%SP+32];
61+
; CHECK-NEXT: ld.b64 %rd37, [%SP+56];
62+
; CHECK-NEXT: ld.b64 %rd38, [%SP+48];
63+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
64+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
65+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
66+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
67+
; CHECK-NEXT: ret;
68+
entry:
69+
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
70+
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
71+
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
72+
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
73+
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
74+
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
75+
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
76+
%element.3 = load double, ptr addrspace(3) %offset.3, align 8
77+
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
78+
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
79+
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
80+
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
81+
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
82+
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
83+
ret void
84+
}
85+
86+
; COM: Spill the vector twice. Each chain gets its own stack slot, so the
87+
; resulting spill code for the two chains may be non-overlapping even though
88+
; the insertelt sequences themselves overlap.
89+
define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
90+
; CHECK-LABEL: spill_twice(
91+
; CHECK: {
92+
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[128];
93+
; CHECK-NEXT: .reg .b64 %SP;
94+
; CHECK-NEXT: .reg .b64 %SPL;
95+
; CHECK-NEXT: .reg .b32 %r<7>;
96+
; CHECK-NEXT: .reg .b64 %rd<54>;
97+
; CHECK-EMPTY:
98+
; CHECK-NEXT: // %bb.0: // %entry
99+
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
100+
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
101+
; CHECK-NEXT: ld.param.b64 %rd1, [spill_twice_param_0];
102+
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [spill_twice_param_1];
103+
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1+16];
104+
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+32];
105+
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+48];
106+
; CHECK-NEXT: ld.param.b32 %r1, [spill_twice_param_2];
107+
; CHECK-NEXT: ld.param.b32 %r2, [spill_twice_param_3];
108+
; CHECK-NEXT: ld.param.b32 %r3, [spill_twice_param_4];
109+
; CHECK-NEXT: ld.shared.v2.b64 {%rd10, %rd11}, [%rd1];
110+
; CHECK-NEXT: ld.shared.v2.b64 {%rd12, %rd13}, [%rd1+16];
111+
; CHECK-NEXT: mul.wide.u32 %rd14, %r1, 8;
112+
; CHECK-NEXT: and.b64 %rd15, %rd14, 56;
113+
; CHECK-NEXT: add.u64 %rd16, %SP, 0;
114+
; CHECK-NEXT: add.s64 %rd17, %rd16, %rd15;
115+
; CHECK-NEXT: shl.b32 %r4, %r1, 3;
116+
; CHECK-NEXT: cvt.u64.u32 %rd18, %r4;
117+
; CHECK-NEXT: and.b64 %rd19, %rd18, 56;
118+
; CHECK-NEXT: add.u64 %rd20, %SP, 64;
119+
; CHECK-NEXT: add.s64 %rd21, %rd20, %rd19;
120+
; CHECK-NEXT: mul.wide.u32 %rd22, %r2, 8;
121+
; CHECK-NEXT: and.b64 %rd23, %rd22, 56;
122+
; CHECK-NEXT: add.s64 %rd24, %rd16, %rd23;
123+
; CHECK-NEXT: shl.b32 %r5, %r2, 3;
124+
; CHECK-NEXT: cvt.u64.u32 %rd25, %r5;
125+
; CHECK-NEXT: and.b64 %rd26, %rd25, 56;
126+
; CHECK-NEXT: add.s64 %rd27, %rd20, %rd26;
127+
; CHECK-NEXT: st.b64 [%SP+120], %rd9;
128+
; CHECK-NEXT: st.b64 [%SP+112], %rd8;
129+
; CHECK-NEXT: st.b64 [%SP+104], %rd7;
130+
; CHECK-NEXT: st.b64 [%SP+96], %rd6;
131+
; CHECK-NEXT: st.b64 [%SP+88], %rd5;
132+
; CHECK-NEXT: st.b64 [%SP+80], %rd4;
133+
; CHECK-NEXT: st.b64 [%SP+72], %rd3;
134+
; CHECK-NEXT: st.b64 [%SP+64], %rd2;
135+
; CHECK-NEXT: st.b64 [%rd21], %rd10;
136+
; CHECK-NEXT: st.b64 [%rd27], %rd11;
137+
; CHECK-NEXT: shl.b32 %r6, %r3, 3;
138+
; CHECK-NEXT: cvt.u64.u32 %rd28, %r6;
139+
; CHECK-NEXT: and.b64 %rd29, %rd28, 56;
140+
; CHECK-NEXT: add.s64 %rd30, %rd20, %rd29;
141+
; CHECK-NEXT: st.b64 [%rd30], %rd12;
142+
; CHECK-NEXT: mul.wide.u32 %rd31, %r3, 8;
143+
; CHECK-NEXT: and.b64 %rd32, %rd31, 56;
144+
; CHECK-NEXT: add.s64 %rd33, %rd16, %rd32;
145+
; CHECK-NEXT: ld.b64 %rd34, [%SP+72];
146+
; CHECK-NEXT: ld.b64 %rd35, [%SP+64];
147+
; CHECK-NEXT: ld.b64 %rd36, [%SP+88];
148+
; CHECK-NEXT: ld.b64 %rd37, [%SP+80];
149+
; CHECK-NEXT: ld.b64 %rd38, [%SP+104];
150+
; CHECK-NEXT: ld.b64 %rd39, [%SP+96];
151+
; CHECK-NEXT: ld.b64 %rd40, [%SP+120];
152+
; CHECK-NEXT: ld.b64 %rd41, [%SP+112];
153+
; CHECK-NEXT: st.b64 [%SP+56], %rd9;
154+
; CHECK-NEXT: st.b64 [%SP+48], %rd8;
155+
; CHECK-NEXT: st.b64 [%SP+40], %rd7;
156+
; CHECK-NEXT: st.b64 [%SP+32], %rd6;
157+
; CHECK-NEXT: st.b64 [%SP+24], %rd5;
158+
; CHECK-NEXT: st.b64 [%SP+16], %rd4;
159+
; CHECK-NEXT: st.b64 [%SP+8], %rd3;
160+
; CHECK-NEXT: st.b64 [%SP], %rd2;
161+
; CHECK-NEXT: st.b64 [%rd17], %rd10;
162+
; CHECK-NEXT: st.b64 [%rd24], %rd11;
163+
; CHECK-NEXT: st.b64 [%rd33], %rd12;
164+
; CHECK-NEXT: ld.param.b32 %rd42, [spill_twice_param_5];
165+
; CHECK-NEXT: shl.b64 %rd43, %rd42, 3;
166+
; CHECK-NEXT: and.b64 %rd44, %rd43, 56;
167+
; CHECK-NEXT: add.s64 %rd45, %rd16, %rd44;
168+
; CHECK-NEXT: st.b64 [%rd45], %rd13;
169+
; CHECK-NEXT: ld.b64 %rd46, [%SP+8];
170+
; CHECK-NEXT: ld.b64 %rd47, [%SP];
171+
; CHECK-NEXT: ld.b64 %rd48, [%SP+24];
172+
; CHECK-NEXT: ld.b64 %rd49, [%SP+16];
173+
; CHECK-NEXT: ld.b64 %rd50, [%SP+40];
174+
; CHECK-NEXT: ld.b64 %rd51, [%SP+32];
175+
; CHECK-NEXT: ld.b64 %rd52, [%SP+56];
176+
; CHECK-NEXT: ld.b64 %rd53, [%SP+48];
177+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd53, %rd52};
178+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd51, %rd50};
179+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd49, %rd48};
180+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd47, %rd46};
181+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1144], {%rd41, %rd40};
182+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1128], {%rd39, %rd38};
183+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1112], {%rd37, %rd36};
184+
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1096], {%rd35, %rd34};
185+
; CHECK-NEXT: ret;
186+
entry:
187+
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
188+
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
189+
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
190+
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
191+
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
192+
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
193+
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
194+
%element.3 = load double, ptr addrspace(3) %offset.3, align 8
195+
196+
; COM: begin chain 1
197+
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
198+
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
199+
200+
; COM: interleave a second chain of insertelements
201+
%vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
202+
203+
; COM: continue chain 1
204+
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
205+
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
206+
207+
; COM: save chain 1
208+
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
209+
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
210+
211+
; COM: save chain 2
212+
%location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
213+
store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
214+
ret void
215+
}

0 commit comments

Comments
 (0)