Skip to content

Commit 481a9f7

Browse files
committed
[NVPTX] simplify profitability check
1 parent 79a6a2a commit 481a9f7

File tree

2 files changed

+12
-46
lines changed

2 files changed

+12
-46
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23502,12 +23502,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2350223502
NumDynamic += !isa<ConstantSDNode>(InVec.getOperand(2));
2350323503
}
2350423504

23505-
// We will lower every insertelt in the sequence to a store. In the
23506-
// default handling, only dynamic insertelts in the sequence will be
23507-
// lowered to a store (+ vector save/load for each). Check that our
23508-
// approach reduces the total number of loads and stores over the default.
23509-
if (2 * VT.getVectorMinNumElements() + Seq.size() <
23510-
NumDynamic * (2 * VT.getVectorMinNumElements() + 1)) {
23505+
// It always and only makes sense to lower this sequence when we have more
23506+
// than one dynamic insertelt, since we will not have more than V constant
23507+
// insertelts, so we will be reducing the total number of stores+loads.
23508+
if (NumDynamic > 1) {
2351123509
// In cases where the vector is illegal it will be broken down into
2351223510
// parts and stored in parts - we should use the alignment for the
2351323511
// smallest part.

llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll

Lines changed: 8 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,8 @@ define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
210210

211211
; Test mixed constant and dynamic insertelts with high ratio of dynamic ones.
212212
; Should lower all insertelts to stores.
213-
define <4 x i32> @mix_high_dynamic_ratio(i32 %idx0, i32 %idx1) {
214-
; CHECK-LABEL: mix_high_dynamic_ratio(
213+
define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) {
214+
; CHECK-LABEL: mix_dynamic_constant(
215215
; CHECK: {
216216
; CHECK-NEXT: .local .align 4 .b8 __local_depot6[16];
217217
; CHECK-NEXT: .reg .b64 %SP;
@@ -222,13 +222,13 @@ define <4 x i32> @mix_high_dynamic_ratio(i32 %idx0, i32 %idx1) {
222222
; CHECK-NEXT: // %bb.0:
223223
; CHECK-NEXT: mov.b64 %SPL, __local_depot6;
224224
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
225-
; CHECK-NEXT: ld.param.b32 %rd1, [mix_high_dynamic_ratio_param_0];
225+
; CHECK-NEXT: ld.param.b32 %rd1, [mix_dynamic_constant_param_0];
226226
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
227227
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
228228
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
229229
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
230230
; CHECK-NEXT: st.b32 [%rd5], 10;
231-
; CHECK-NEXT: ld.param.b32 %rd6, [mix_high_dynamic_ratio_param_1];
231+
; CHECK-NEXT: ld.param.b32 %rd6, [mix_dynamic_constant_param_1];
232232
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
233233
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
234234
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
@@ -246,50 +246,18 @@ define <4 x i32> @mix_high_dynamic_ratio(i32 %idx0, i32 %idx1) {
246246
ret <4 x i32> %v2
247247
}
248248

249-
; Test mixed constant and dynamic insertelts with low ratio of dynamic ones.
250-
; Should handle dynamic insertelt individually.
251-
define <4 x i32> @mix_low_dynamic_ratio(i32 %idx) {
252-
; CHECK-LABEL: mix_low_dynamic_ratio(
253-
; CHECK: {
254-
; CHECK-NEXT: .local .align 4 .b8 __local_depot7[16];
255-
; CHECK-NEXT: .reg .b64 %SP;
256-
; CHECK-NEXT: .reg .b64 %SPL;
257-
; CHECK-NEXT: .reg .b32 %r<3>;
258-
; CHECK-NEXT: .reg .b64 %rd<6>;
259-
; CHECK-EMPTY:
260-
; CHECK-NEXT: // %bb.0:
261-
; CHECK-NEXT: mov.b64 %SPL, __local_depot7;
262-
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
263-
; CHECK-NEXT: ld.param.b32 %rd1, [mix_low_dynamic_ratio_param_0];
264-
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
265-
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
266-
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
267-
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
268-
; CHECK-NEXT: st.b32 [%SP], 10;
269-
; CHECK-NEXT: st.b32 [%rd5], 20;
270-
; CHECK-NEXT: ld.b32 %r1, [%SP+4];
271-
; CHECK-NEXT: ld.b32 %r2, [%SP];
272-
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, %r1, 30, 40};
273-
; CHECK-NEXT: ret;
274-
%v0 = insertelement <4 x i32> poison, i32 10, i32 0
275-
%v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
276-
%v2 = insertelement <4 x i32> %v1, i32 30, i32 2
277-
%v3 = insertelement <4 x i32> %v2, i32 40, i32 3
278-
ret <4 x i32> %v3
279-
}
280-
281249
; Test two separate chains that don't interfere
282250
define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
283251
; CHECK-LABEL: two_separate_chains(
284252
; CHECK: {
285-
; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32];
253+
; CHECK-NEXT: .local .align 4 .b8 __local_depot7[32];
286254
; CHECK-NEXT: .reg .b64 %SP;
287255
; CHECK-NEXT: .reg .b64 %SPL;
288256
; CHECK-NEXT: .reg .b32 %r<7>;
289257
; CHECK-NEXT: .reg .b64 %rd<13>;
290258
; CHECK-EMPTY:
291259
; CHECK-NEXT: // %bb.0:
292-
; CHECK-NEXT: mov.b64 %SPL, __local_depot8;
260+
; CHECK-NEXT: mov.b64 %SPL, __local_depot7;
293261
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
294262
; CHECK-NEXT: ld.param.b32 %rd1, [two_separate_chains_param_0];
295263
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
@@ -331,14 +299,14 @@ define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
331299
define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
332300
; CHECK-LABEL: overlapping_chains(
333301
; CHECK: {
334-
; CHECK-NEXT: .local .align 4 .b8 __local_depot9[32];
302+
; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32];
335303
; CHECK-NEXT: .reg .b64 %SP;
336304
; CHECK-NEXT: .reg .b64 %SPL;
337305
; CHECK-NEXT: .reg .b32 %r<7>;
338306
; CHECK-NEXT: .reg .b64 %rd<14>;
339307
; CHECK-EMPTY:
340308
; CHECK-NEXT: // %bb.0:
341-
; CHECK-NEXT: mov.b64 %SPL, __local_depot9;
309+
; CHECK-NEXT: mov.b64 %SPL, __local_depot8;
342310
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
343311
; CHECK-NEXT: ld.param.b32 %rd1, [overlapping_chains_param_0];
344312
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;

0 commit comments

Comments
 (0)