Skip to content

Commit 64f4d5b

Browse files
committed
[NVPTX] Address case where the vector element type is not byte-sized
1 parent f1cac7f commit 64f4d5b

File tree

2 files changed

+108
-138
lines changed

2 files changed

+108
-138
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23506,6 +23506,15 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2350623506
// than one dynamic insertelt, since we will not have more than V constant
2350723507
// insertelts, so we will be reducing the total number of stores+loads.
2350823508
if (NumDynamic > 1) {
23509+
// Make the vector elements byte-addressable if they aren't already.
23510+
EVT OldVT = VT;
23511+
EVT EltVT = VT.getVectorElementType();
23512+
bool IsByteSized = EltVT.isByteSized();
23513+
if (!IsByteSized) {
23514+
EltVT = EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext());
23515+
VT = VT.changeElementType(EltVT);
23516+
}
23517+
2350923518
// In cases where the vector is illegal it will be broken down into
2351023519
// parts and stored in parts - we should use the alignment for the
2351123520
// smallest part.
@@ -23518,6 +23527,8 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2351823527

2351923528
// Save the vector to the stack
2352023529
SDValue InVec = Seq.back()->getOperand(0);
23530+
if (!IsByteSized)
23531+
InVec = DAG.getNode(ISD::ANY_EXTEND, DL, VT, InVec);
2352123532
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
2352223533
PtrInfo, SmallestAlign);
2352323534

@@ -23526,6 +23537,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2352623537
SDValue Elmnt = N->getOperand(1);
2352723538
SDValue Index = N->getOperand(2);
2352823539

23540+
// Check if we have to extend the element type
23541+
if (!IsByteSized && Elmnt.getValueType().bitsLT(EltVT))
23542+
Elmnt = DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Elmnt);
23543+
2352923544
// Store the new element. This may be larger than the vector element
2353023545
// type, so use a truncating store.
2353123546
SDValue EltPtr =
@@ -23540,7 +23555,8 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2354023555
// Load the saved vector from the stack
2354123556
SDValue Load =
2354223557
DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
23543-
return Load.getValue(0);
23558+
SDValue LoadV = Load.getValue(0);
23559+
return IsByteSized ? LoadV : DAG.getAnyExtOrTrunc(LoadV, DL, OldVT);
2354423560
}
2354523561
}
2354623562

llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll

Lines changed: 91 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -100,37 +100,26 @@ define <4 x i32> @dynamic_in_middle(i32 %idx) {
100100
define <4 x i32> @repeated_same_index(i32 %idx) {
101101
; CHECK-LABEL: repeated_same_index(
102102
; CHECK: {
103-
; CHECK-NEXT: .local .align 4 .b8 __local_depot3[32];
103+
; CHECK-NEXT: .local .align 4 .b8 __local_depot3[16];
104104
; CHECK-NEXT: .reg .b64 %SP;
105105
; CHECK-NEXT: .reg .b64 %SPL;
106-
; CHECK-NEXT: .reg .b32 %r<9>;
107-
; CHECK-NEXT: .reg .b64 %rd<8>;
106+
; CHECK-NEXT: .reg .b32 %r<5>;
107+
; CHECK-NEXT: .reg .b64 %rd<6>;
108108
; CHECK-EMPTY:
109109
; CHECK-NEXT: // %bb.0:
110110
; CHECK-NEXT: mov.b64 %SPL, __local_depot3;
111111
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
112112
; CHECK-NEXT: ld.param.b32 %rd1, [repeated_same_index_param_0];
113113
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
114114
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
115-
; CHECK-NEXT: add.u64 %rd4, %SP, 16;
115+
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
116116
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
117-
; CHECK-NEXT: add.u64 %rd6, %SP, 0;
118-
; CHECK-NEXT: add.s64 %rd7, %rd6, %rd3;
119-
; CHECK-NEXT: st.b32 [%rd7], 10;
120-
; CHECK-NEXT: ld.b32 %r1, [%SP];
121-
; CHECK-NEXT: ld.b32 %r2, [%SP+4];
122-
; CHECK-NEXT: ld.b32 %r3, [%SP+8];
123-
; CHECK-NEXT: ld.b32 %r4, [%SP+12];
124-
; CHECK-NEXT: st.b32 [%SP+28], %r4;
125-
; CHECK-NEXT: st.b32 [%SP+24], %r3;
126-
; CHECK-NEXT: st.b32 [%SP+20], %r2;
127-
; CHECK-NEXT: st.b32 [%SP+16], %r1;
128117
; CHECK-NEXT: st.b32 [%rd5], 20;
129-
; CHECK-NEXT: ld.b32 %r5, [%SP+28];
130-
; CHECK-NEXT: ld.b32 %r6, [%SP+24];
131-
; CHECK-NEXT: ld.b32 %r7, [%SP+20];
132-
; CHECK-NEXT: ld.b32 %r8, [%SP+16];
133-
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
118+
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
119+
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
120+
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
121+
; CHECK-NEXT: ld.b32 %r4, [%SP];
122+
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
134123
; CHECK-NEXT: ret;
135124
%v0 = insertelement <4 x i32> poison, i32 10, i32 %idx
136125
%v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
@@ -141,11 +130,11 @@ define <4 x i32> @repeated_same_index(i32 %idx) {
141130
define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) {
142131
; CHECK-LABEL: multiple_dynamic(
143132
; CHECK: {
144-
; CHECK-NEXT: .local .align 4 .b8 __local_depot4[32];
133+
; CHECK-NEXT: .local .align 4 .b8 __local_depot4[16];
145134
; CHECK-NEXT: .reg .b64 %SP;
146135
; CHECK-NEXT: .reg .b64 %SPL;
147-
; CHECK-NEXT: .reg .b32 %r<9>;
148-
; CHECK-NEXT: .reg .b64 %rd<11>;
136+
; CHECK-NEXT: .reg .b32 %r<5>;
137+
; CHECK-NEXT: .reg .b64 %rd<10>;
149138
; CHECK-EMPTY:
150139
; CHECK-NEXT: // %bb.0:
151140
; CHECK-NEXT: mov.b64 %SPL, __local_depot4;
@@ -159,22 +148,13 @@ define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) {
159148
; CHECK-NEXT: ld.param.b32 %rd6, [multiple_dynamic_param_1];
160149
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
161150
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
162-
; CHECK-NEXT: add.u64 %rd9, %SP, 16;
163-
; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
164-
; CHECK-NEXT: ld.b32 %r1, [%SP];
165-
; CHECK-NEXT: ld.b32 %r2, [%SP+4];
166-
; CHECK-NEXT: ld.b32 %r3, [%SP+8];
167-
; CHECK-NEXT: ld.b32 %r4, [%SP+12];
168-
; CHECK-NEXT: st.b32 [%SP+28], %r4;
169-
; CHECK-NEXT: st.b32 [%SP+24], %r3;
170-
; CHECK-NEXT: st.b32 [%SP+20], %r2;
171-
; CHECK-NEXT: st.b32 [%SP+16], %r1;
172-
; CHECK-NEXT: st.b32 [%rd10], 20;
173-
; CHECK-NEXT: ld.b32 %r5, [%SP+28];
174-
; CHECK-NEXT: ld.b32 %r6, [%SP+24];
175-
; CHECK-NEXT: ld.b32 %r7, [%SP+20];
176-
; CHECK-NEXT: ld.b32 %r8, [%SP+16];
177-
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
151+
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
152+
; CHECK-NEXT: st.b32 [%rd9], 20;
153+
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
154+
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
155+
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
156+
; CHECK-NEXT: ld.b32 %r4, [%SP];
157+
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
178158
; CHECK-NEXT: ret;
179159
%v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
180160
%v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
@@ -185,11 +165,11 @@ define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) {
185165
define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
186166
; CHECK-LABEL: all_dynamic(
187167
; CHECK: {
188-
; CHECK-NEXT: .local .align 4 .b8 __local_depot5[64];
168+
; CHECK-NEXT: .local .align 4 .b8 __local_depot5[16];
189169
; CHECK-NEXT: .reg .b64 %SP;
190170
; CHECK-NEXT: .reg .b64 %SPL;
191-
; CHECK-NEXT: .reg .b32 %r<17>;
192-
; CHECK-NEXT: .reg .b64 %rd<21>;
171+
; CHECK-NEXT: .reg .b32 %r<5>;
172+
; CHECK-NEXT: .reg .b64 %rd<18>;
193173
; CHECK-EMPTY:
194174
; CHECK-NEXT: // %bb.0:
195175
; CHECK-NEXT: mov.b64 %SPL, __local_depot5;
@@ -199,54 +179,27 @@ define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
199179
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
200180
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
201181
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
202-
; CHECK-NEXT: st.b32 [%rd5], 10;
203182
; CHECK-NEXT: ld.param.b32 %rd6, [all_dynamic_param_1];
204183
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
205184
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
206-
; CHECK-NEXT: add.u64 %rd9, %SP, 16;
207-
; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
208-
; CHECK-NEXT: ld.b32 %r1, [%SP];
209-
; CHECK-NEXT: ld.b32 %r2, [%SP+4];
210-
; CHECK-NEXT: ld.b32 %r3, [%SP+8];
211-
; CHECK-NEXT: ld.b32 %r4, [%SP+12];
212-
; CHECK-NEXT: st.b32 [%SP+28], %r4;
213-
; CHECK-NEXT: st.b32 [%SP+24], %r3;
214-
; CHECK-NEXT: st.b32 [%SP+20], %r2;
215-
; CHECK-NEXT: st.b32 [%SP+16], %r1;
216-
; CHECK-NEXT: st.b32 [%rd10], 20;
217-
; CHECK-NEXT: ld.param.b32 %rd11, [all_dynamic_param_2];
218-
; CHECK-NEXT: and.b64 %rd12, %rd11, 3;
219-
; CHECK-NEXT: shl.b64 %rd13, %rd12, 2;
220-
; CHECK-NEXT: add.u64 %rd14, %SP, 32;
221-
; CHECK-NEXT: add.s64 %rd15, %rd14, %rd13;
222-
; CHECK-NEXT: ld.b32 %r5, [%SP+16];
223-
; CHECK-NEXT: ld.b32 %r6, [%SP+20];
224-
; CHECK-NEXT: ld.b32 %r7, [%SP+24];
225-
; CHECK-NEXT: ld.b32 %r8, [%SP+28];
226-
; CHECK-NEXT: st.b32 [%SP+44], %r8;
227-
; CHECK-NEXT: st.b32 [%SP+40], %r7;
228-
; CHECK-NEXT: st.b32 [%SP+36], %r6;
229-
; CHECK-NEXT: st.b32 [%SP+32], %r5;
230-
; CHECK-NEXT: st.b32 [%rd15], 30;
231-
; CHECK-NEXT: ld.param.b32 %rd16, [all_dynamic_param_3];
232-
; CHECK-NEXT: and.b64 %rd17, %rd16, 3;
233-
; CHECK-NEXT: shl.b64 %rd18, %rd17, 2;
234-
; CHECK-NEXT: add.u64 %rd19, %SP, 48;
235-
; CHECK-NEXT: add.s64 %rd20, %rd19, %rd18;
236-
; CHECK-NEXT: ld.b32 %r9, [%SP+32];
237-
; CHECK-NEXT: ld.b32 %r10, [%SP+36];
238-
; CHECK-NEXT: ld.b32 %r11, [%SP+40];
239-
; CHECK-NEXT: ld.b32 %r12, [%SP+44];
240-
; CHECK-NEXT: st.b32 [%SP+60], %r12;
241-
; CHECK-NEXT: st.b32 [%SP+56], %r11;
242-
; CHECK-NEXT: st.b32 [%SP+52], %r10;
243-
; CHECK-NEXT: st.b32 [%SP+48], %r9;
244-
; CHECK-NEXT: st.b32 [%rd20], 40;
245-
; CHECK-NEXT: ld.b32 %r13, [%SP+60];
246-
; CHECK-NEXT: ld.b32 %r14, [%SP+56];
247-
; CHECK-NEXT: ld.b32 %r15, [%SP+52];
248-
; CHECK-NEXT: ld.b32 %r16, [%SP+48];
249-
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r16, %r15, %r14, %r13};
185+
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
186+
; CHECK-NEXT: ld.param.b32 %rd10, [all_dynamic_param_2];
187+
; CHECK-NEXT: and.b64 %rd11, %rd10, 3;
188+
; CHECK-NEXT: shl.b64 %rd12, %rd11, 2;
189+
; CHECK-NEXT: add.s64 %rd13, %rd4, %rd12;
190+
; CHECK-NEXT: st.b32 [%rd5], 10;
191+
; CHECK-NEXT: st.b32 [%rd9], 20;
192+
; CHECK-NEXT: st.b32 [%rd13], 30;
193+
; CHECK-NEXT: ld.param.b32 %rd14, [all_dynamic_param_3];
194+
; CHECK-NEXT: and.b64 %rd15, %rd14, 3;
195+
; CHECK-NEXT: shl.b64 %rd16, %rd15, 2;
196+
; CHECK-NEXT: add.s64 %rd17, %rd4, %rd16;
197+
; CHECK-NEXT: st.b32 [%rd17], 40;
198+
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
199+
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
200+
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
201+
; CHECK-NEXT: ld.b32 %r4, [%SP];
202+
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
250203
; CHECK-NEXT: ret;
251204
%v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
252205
%v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
@@ -260,11 +213,11 @@ define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
260213
define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) {
261214
; CHECK-LABEL: mix_dynamic_constant(
262215
; CHECK: {
263-
; CHECK-NEXT: .local .align 4 .b8 __local_depot6[32];
216+
; CHECK-NEXT: .local .align 4 .b8 __local_depot6[16];
264217
; CHECK-NEXT: .reg .b64 %SP;
265218
; CHECK-NEXT: .reg .b64 %SPL;
266-
; CHECK-NEXT: .reg .b32 %r<8>;
267-
; CHECK-NEXT: .reg .b64 %rd<11>;
219+
; CHECK-NEXT: .reg .b32 %r<5>;
220+
; CHECK-NEXT: .reg .b64 %rd<10>;
268221
; CHECK-EMPTY:
269222
; CHECK-NEXT: // %bb.0:
270223
; CHECK-NEXT: mov.b64 %SPL, __local_depot6;
@@ -278,21 +231,14 @@ define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) {
278231
; CHECK-NEXT: ld.param.b32 %rd6, [mix_dynamic_constant_param_1];
279232
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
280233
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
281-
; CHECK-NEXT: add.u64 %rd9, %SP, 16;
282-
; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
283-
; CHECK-NEXT: ld.b32 %r1, [%SP];
234+
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
235+
; CHECK-NEXT: st.b32 [%SP+4], 20;
236+
; CHECK-NEXT: st.b32 [%rd9], 30;
237+
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
284238
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
285-
; CHECK-NEXT: ld.b32 %r3, [%SP+12];
286-
; CHECK-NEXT: st.b32 [%SP+28], %r3;
287-
; CHECK-NEXT: st.b32 [%SP+24], %r2;
288-
; CHECK-NEXT: st.b32 [%SP+16], %r1;
289-
; CHECK-NEXT: st.b32 [%SP+20], 20;
290-
; CHECK-NEXT: st.b32 [%rd10], 30;
291-
; CHECK-NEXT: ld.b32 %r4, [%SP+28];
292-
; CHECK-NEXT: ld.b32 %r5, [%SP+24];
293-
; CHECK-NEXT: ld.b32 %r6, [%SP+20];
294-
; CHECK-NEXT: ld.b32 %r7, [%SP+16];
295-
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r7, %r6, %r5, %r4};
239+
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
240+
; CHECK-NEXT: ld.b32 %r4, [%SP];
241+
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
296242
; CHECK-NEXT: ret;
297243
%v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
298244
%v1 = insertelement <4 x i32> %v0, i32 20, i32 1
@@ -356,39 +302,37 @@ define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
356302
; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32];
357303
; CHECK-NEXT: .reg .b64 %SP;
358304
; CHECK-NEXT: .reg .b64 %SPL;
359-
; CHECK-NEXT: .reg .b32 %r<8>;
360-
; CHECK-NEXT: .reg .b64 %rd<13>;
305+
; CHECK-NEXT: .reg .b32 %r<7>;
306+
; CHECK-NEXT: .reg .b64 %rd<14>;
361307
; CHECK-EMPTY:
362308
; CHECK-NEXT: // %bb.0:
363309
; CHECK-NEXT: mov.b64 %SPL, __local_depot8;
364310
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
365311
; CHECK-NEXT: ld.param.b32 %rd1, [overlapping_chains_param_0];
366312
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
367313
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
368-
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
314+
; CHECK-NEXT: add.u64 %rd4, %SP, 16;
369315
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
370316
; CHECK-NEXT: st.b32 [%rd5], 10;
371-
; CHECK-NEXT: ld.param.b32 %rd6, [overlapping_chains_param_1];
372-
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
373-
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
374-
; CHECK-NEXT: add.u64 %rd9, %SP, 16;
375-
; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
376-
; CHECK-NEXT: ld.b32 %r1, [%SP];
377-
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
317+
; CHECK-NEXT: add.u64 %rd6, %SP, 0;
318+
; CHECK-NEXT: add.s64 %rd7, %rd6, %rd3;
319+
; CHECK-NEXT: ld.b32 %r1, [%SP+28];
320+
; CHECK-NEXT: ld.b32 %r2, [%SP+16];
321+
; CHECK-NEXT: ld.param.b64 %rd8, [overlapping_chains_param_2];
322+
; CHECK-NEXT: st.b32 [%rd7], 10;
323+
; CHECK-NEXT: ld.param.b32 %rd9, [overlapping_chains_param_1];
324+
; CHECK-NEXT: and.b64 %rd10, %rd9, 3;
325+
; CHECK-NEXT: shl.b64 %rd11, %rd10, 2;
326+
; CHECK-NEXT: add.s64 %rd12, %rd6, %rd11;
327+
; CHECK-NEXT: st.b32 [%SP+4], 20;
328+
; CHECK-NEXT: st.b32 [%rd12], 30;
329+
; CHECK-NEXT: ld.param.b64 %rd13, [overlapping_chains_param_3];
378330
; CHECK-NEXT: ld.b32 %r3, [%SP+12];
379-
; CHECK-NEXT: ld.param.b64 %rd11, [overlapping_chains_param_2];
380-
; CHECK-NEXT: st.b32 [%SP+28], %r3;
381-
; CHECK-NEXT: st.b32 [%SP+24], %r2;
382-
; CHECK-NEXT: st.b32 [%SP+16], %r1;
383-
; CHECK-NEXT: st.b32 [%SP+20], 20;
384-
; CHECK-NEXT: st.b32 [%rd10], 30;
385-
; CHECK-NEXT: ld.param.b64 %rd12, [overlapping_chains_param_3];
386-
; CHECK-NEXT: ld.b32 %r4, [%SP+28];
387-
; CHECK-NEXT: ld.b32 %r5, [%SP+24];
388-
; CHECK-NEXT: ld.b32 %r6, [%SP+20];
389-
; CHECK-NEXT: ld.b32 %r7, [%SP+16];
390-
; CHECK-NEXT: st.v4.b32 [%rd11], {%r1, 20, 40, %r3};
391-
; CHECK-NEXT: st.v4.b32 [%rd12], {%r7, %r6, %r5, %r4};
331+
; CHECK-NEXT: ld.b32 %r4, [%SP+8];
332+
; CHECK-NEXT: ld.b32 %r5, [%SP+4];
333+
; CHECK-NEXT: ld.b32 %r6, [%SP];
334+
; CHECK-NEXT: st.v4.b32 [%rd8], {%r2, 20, 40, %r1};
335+
; CHECK-NEXT: st.v4.b32 [%rd13], {%r6, %r5, %r4, %r3};
392336
; CHECK-NEXT: ret;
393337
%v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
394338
%v1 = insertelement <4 x i32> %v0, i32 20, i32 1
@@ -775,18 +719,28 @@ define <8 x i6> @dynamic_i6(i32 %idx) {
775719
define <4 x i3> @multiple_dynamic_i3(i32 %idx0, i32 %idx1) {
776720
; CHECK-LABEL: multiple_dynamic_i3(
777721
; CHECK: {
778-
; CHECK-NEXT: .reg .b32 %r<9>;
722+
; CHECK-NEXT: .local .align 4 .b8 __local_depot16[4];
723+
; CHECK-NEXT: .reg .b64 %SP;
724+
; CHECK-NEXT: .reg .b64 %SPL;
725+
; CHECK-NEXT: .reg .b32 %r<3>;
726+
; CHECK-NEXT: .reg .b64 %rd<8>;
779727
; CHECK-EMPTY:
780728
; CHECK-NEXT: // %bb.0:
781-
; CHECK-NEXT: ld.param.b32 %r1, [multiple_dynamic_i3_param_0];
782-
; CHECK-NEXT: shl.b32 %r2, %r1, 3;
783-
; CHECK-NEXT: bfi.b32 %r3, 1, %r4, %r2, 8;
784-
; CHECK-NEXT: ld.param.b32 %r5, [multiple_dynamic_i3_param_1];
785-
; CHECK-NEXT: shl.b32 %r6, %r5, 3;
786-
; CHECK-NEXT: bfi.b32 %r7, 2, %r3, %r6, 8;
787-
; CHECK-NEXT: st.param.b16 [func_retval0], %r7;
788-
; CHECK-NEXT: shr.u32 %r8, %r7, 16;
789-
; CHECK-NEXT: st.param.b16 [func_retval0+2], %r8;
729+
; CHECK-NEXT: mov.b64 %SPL, __local_depot16;
730+
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
731+
; CHECK-NEXT: ld.param.b32 %rd1, [multiple_dynamic_i3_param_0];
732+
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
733+
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
734+
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
735+
; CHECK-NEXT: st.b8 [%rd4], 1;
736+
; CHECK-NEXT: ld.param.b32 %rd5, [multiple_dynamic_i3_param_1];
737+
; CHECK-NEXT: and.b64 %rd6, %rd5, 3;
738+
; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6;
739+
; CHECK-NEXT: st.b8 [%rd7], 2;
740+
; CHECK-NEXT: ld.b32 %r1, [%SP];
741+
; CHECK-NEXT: st.param.b16 [func_retval0], %r1;
742+
; CHECK-NEXT: shr.u32 %r2, %r1, 16;
743+
; CHECK-NEXT: st.param.b16 [func_retval0+2], %r2;
790744
; CHECK-NEXT: ret;
791745
%v0 = insertelement <4 x i3> poison, i3 1, i32 %idx0
792746
%v1 = insertelement <4 x i3> %v0, i3 2, i32 %idx1

0 commit comments

Comments
 (0)