Skip to content

Commit 6f6d389

Browse files
committed
[SplitKit] Only copy live lanes
When splitting a live interval with subranges, only insert copies for the lanes that are live at the point of the split. This avoids some unnecessary copies and fixes a problem where copying dead lanes was generating MIR that failed verification. The test case for this is test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir.

Without this fix, some earlier live range splitting would create %430:

    %430 [256r,848r:0)[848r,2584r:1) 0@256r 1@848r L0000000000000003 [848r,2584r:0) 0@848r L0000000000000030 [256r,2584r:0) 0@256r weight:1.480938e-03
    ...
    256B    undef %430.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %20.sub1:vreg_128, implicit $exec
    ...
    848B    %430.sub0:vreg_128 = V_AND_B32_e32 %92:sreg_32, %20.sub1:vreg_128, implicit $exec
    ...
    2584B   %431:vreg_128 = COPY %430:vreg_128

Then RAGreedy::tryLocalSplit would split %430 into %432 and %433 just before 848B giving:

    %432 [256r,844r:0) 0@256r L0000000000000030 [256r,844r:0) 0@256r weight:3.066802e-03
    %433 [844r,848r:0)[848r,2584r:1) 0@844r 1@848r L0000000000000030 [844r,2584r:0) 0@844r L0000000000000003 [844r,844d:0)[848r,2584r:1) 0@844r 1@848r weight:2.831776e-03
    ...
    256B    undef %432.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %20.sub1:vreg_128, implicit $exec
    ...
    844B    undef %433.sub0:vreg_128 = COPY %432.sub0:vreg_128 {
              internal %433.sub2:vreg_128 = COPY %432.sub2:vreg_128
    848B    }
            %433.sub0:vreg_128 = V_AND_B32_e32 %92:sreg_32, %20.sub1:vreg_128, implicit $exec
    ...
    2584B   %431:vreg_128 = COPY %433:vreg_128

Note that the copy from %432 to %433 at 844B is a curious bundle-without-a-BUNDLE-instruction that SplitKit creates deliberately, and it includes a copy of .sub0 which is not live at this point, and that causes it to fail verification:

    *** Bad machine code: No live subrange at use ***
    - function: zextload_global_v64i16_to_v64i64
    - basic block: %bb.0 (0x7faed48) [0B;2848B)
    - instruction: 844B undef %433.sub0:vreg_128 = COPY %432.sub0:vreg_128
    - operand 1: %432.sub0:vreg_128
    - interval: %432 [256r,844r:0) 0@256r L0000000000000030 [256r,844r:0) 0@256r weight:3.066802e-03
    - at: 844B

Using real bundles with a BUNDLE instruction might also fix this problem, but the current fix is less invasive and also avoids some unnecessary copies.

https://bugs.llvm.org/show_bug.cgi?id=47492

Differential Revision: https://reviews.llvm.org/D87757
1 parent d49707c commit 6f6d389

File tree

5 files changed

+572
-53
lines changed

5 files changed

+572
-53
lines changed

llvm/lib/CodeGen/SplitKit.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -649,10 +649,13 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
649649
}
650650
if (!DidRemat) {
651651
LaneBitmask LaneMask;
652-
if (LI->hasSubRanges()) {
652+
if (OrigLI.hasSubRanges()) {
653653
LaneMask = LaneBitmask::getNone();
654-
for (LiveInterval::SubRange &S : LI->subranges())
655-
LaneMask |= S.LaneMask;
654+
for (LiveInterval::SubRange &S : OrigLI.subranges()) {
655+
if (S.liveAt(UseIdx))
656+
LaneMask |= S.LaneMask;
657+
}
658+
assert(LaneMask.any() && "Interval has no live subranges");
656659
} else {
657660
LaneMask = LaneBitmask::getAll();
658661
}

llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ entry:
3939
; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32
4040
; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]]
4141
; GFX6: NumSgprs: 48
42-
; GFX6: ScratchSize: 8624
42+
; GFX6: ScratchSize: 8608
4343
define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 {
4444
entry:
4545
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)

llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir

Lines changed: 37 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,11 @@ body: |
1616
; RA: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
1717
; RA: undef %5.sub1:sgpr_1024 = S_MOV_B32 -1
1818
; RA: %5.sub0:sgpr_1024 = S_MOV_B32 -1
19-
; RA: undef %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %5.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 {
20-
; RA: internal %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %5.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27
21-
; RA: internal %4.sub28_sub29:sgpr_1024 = COPY %5.sub28_sub29
22-
; RA: }
19+
; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %5.sub0_sub1
2320
; RA: undef %3.sub0:sgpr_1024 = S_MOV_B32 0
2421
; RA: bb.1:
2522
; RA: successors: %bb.2(0x80000000)
26-
; RA: undef %6.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 {
27-
; RA: internal %6.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27
28-
; RA: internal %6.sub28_sub29:sgpr_1024 = COPY %4.sub28_sub29
29-
; RA: }
23+
; RA: undef %6.sub0_sub1:sgpr_1024 = COPY %4.sub0_sub1
3024
; RA: %6.sub2:sgpr_1024 = COPY %6.sub0
3125
; RA: %6.sub3:sgpr_1024 = COPY %6.sub1
3226
; RA: %6.sub4:sgpr_1024 = COPY %6.sub0
@@ -55,10 +49,7 @@ body: |
5549
; RA: %6.sub27:sgpr_1024 = COPY %6.sub1
5650
; RA: %6.sub28:sgpr_1024 = COPY %6.sub0
5751
; RA: %6.sub29:sgpr_1024 = COPY %6.sub1
58-
; RA: undef %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %6.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 {
59-
; RA: internal %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %6.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27
60-
; RA: internal %4.sub28_sub29:sgpr_1024 = COPY %6.sub28_sub29
61-
; RA: }
52+
; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %6.sub0_sub1
6253
; RA: %3.sub1:sgpr_1024 = COPY %3.sub0
6354
; RA: %3.sub2:sgpr_1024 = COPY %3.sub0
6455
; RA: %3.sub3:sgpr_1024 = COPY %3.sub0
@@ -102,40 +93,40 @@ body: |
10293
; VR: renamable $sgpr68 = S_MOV_B32 -1
10394
; VR: renamable $sgpr36 = S_MOV_B32 0
10495
; VR: renamable $sgpr34_sgpr35 = IMPLICIT_DEF
105-
; VR: renamable $sgpr98_sgpr99 = IMPLICIT_DEF
106-
; VR: renamable $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = KILL undef renamable $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
107-
; VR: renamable $sgpr96_sgpr97 = KILL undef renamable $sgpr96_sgpr97
96+
; VR: renamable $sgpr70_sgpr71 = IMPLICIT_DEF
10897
; VR: bb.1:
10998
; VR: successors: %bb.2(0x80000000)
110-
; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0FFFFFFFFFFFFFFF, $sgpr34_sgpr35, $sgpr98_sgpr99
111-
; VR: renamable $sgpr70 = COPY renamable $sgpr68
112-
; VR: renamable $sgpr71 = COPY renamable $sgpr69
113-
; VR: renamable $sgpr72 = COPY renamable $sgpr68
114-
; VR: renamable $sgpr73 = COPY renamable $sgpr69
115-
; VR: renamable $sgpr74 = COPY renamable $sgpr68
116-
; VR: renamable $sgpr75 = COPY renamable $sgpr69
117-
; VR: renamable $sgpr76 = COPY renamable $sgpr68
118-
; VR: renamable $sgpr77 = COPY renamable $sgpr69
119-
; VR: renamable $sgpr78 = COPY renamable $sgpr68
120-
; VR: renamable $sgpr79 = COPY renamable $sgpr69
121-
; VR: renamable $sgpr80 = COPY renamable $sgpr68
122-
; VR: renamable $sgpr81 = COPY renamable $sgpr69
123-
; VR: renamable $sgpr82 = COPY renamable $sgpr68
124-
; VR: renamable $sgpr83 = COPY renamable $sgpr69
125-
; VR: renamable $sgpr84 = COPY renamable $sgpr68
126-
; VR: renamable $sgpr85 = COPY renamable $sgpr69
127-
; VR: renamable $sgpr86 = COPY renamable $sgpr68
128-
; VR: renamable $sgpr87 = COPY renamable $sgpr69
129-
; VR: renamable $sgpr88 = COPY renamable $sgpr68
130-
; VR: renamable $sgpr89 = COPY renamable $sgpr69
131-
; VR: renamable $sgpr90 = COPY renamable $sgpr68
132-
; VR: renamable $sgpr91 = COPY renamable $sgpr69
133-
; VR: renamable $sgpr92 = COPY renamable $sgpr68
134-
; VR: renamable $sgpr93 = COPY renamable $sgpr69
135-
; VR: renamable $sgpr94 = COPY renamable $sgpr68
136-
; VR: renamable $sgpr95 = COPY renamable $sgpr69
137-
; VR: renamable $sgpr96 = COPY renamable $sgpr68
138-
; VR: renamable $sgpr97 = COPY renamable $sgpr69
99+
; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71
100+
; VR: renamable $sgpr40_sgpr41 = COPY killed renamable $sgpr68_sgpr69
101+
; VR: renamable $sgpr42 = COPY renamable $sgpr40
102+
; VR: renamable $sgpr43 = COPY renamable $sgpr41
103+
; VR: renamable $sgpr44 = COPY renamable $sgpr40
104+
; VR: renamable $sgpr45 = COPY renamable $sgpr41
105+
; VR: renamable $sgpr46 = COPY renamable $sgpr40
106+
; VR: renamable $sgpr47 = COPY renamable $sgpr41
107+
; VR: renamable $sgpr48 = COPY renamable $sgpr40
108+
; VR: renamable $sgpr49 = COPY renamable $sgpr41
109+
; VR: renamable $sgpr50 = COPY renamable $sgpr40
110+
; VR: renamable $sgpr51 = COPY renamable $sgpr41
111+
; VR: renamable $sgpr52 = COPY renamable $sgpr40
112+
; VR: renamable $sgpr53 = COPY renamable $sgpr41
113+
; VR: renamable $sgpr54 = COPY renamable $sgpr40
114+
; VR: renamable $sgpr55 = COPY renamable $sgpr41
115+
; VR: renamable $sgpr56 = COPY renamable $sgpr40
116+
; VR: renamable $sgpr57 = COPY renamable $sgpr41
117+
; VR: renamable $sgpr58 = COPY renamable $sgpr40
118+
; VR: renamable $sgpr59 = COPY renamable $sgpr41
119+
; VR: renamable $sgpr60 = COPY renamable $sgpr40
120+
; VR: renamable $sgpr61 = COPY renamable $sgpr41
121+
; VR: renamable $sgpr62 = COPY renamable $sgpr40
122+
; VR: renamable $sgpr63 = COPY renamable $sgpr41
123+
; VR: renamable $sgpr64 = COPY renamable $sgpr40
124+
; VR: renamable $sgpr65 = COPY renamable $sgpr41
125+
; VR: renamable $sgpr66 = COPY renamable $sgpr40
126+
; VR: renamable $sgpr67 = COPY renamable $sgpr41
127+
; VR: renamable $sgpr68 = COPY renamable $sgpr40
128+
; VR: renamable $sgpr69 = COPY renamable $sgpr41
129+
; VR: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr40_sgpr41
139130
; VR: renamable $sgpr37 = COPY renamable $sgpr36
140131
; VR: renamable $sgpr38 = COPY renamable $sgpr36
141132
; VR: renamable $sgpr39 = COPY renamable $sgpr36
@@ -169,8 +160,8 @@ body: |
169160
; VR: renamable $sgpr67 = COPY renamable $sgpr36
170161
; VR: bb.2:
171162
; VR: successors: %bb.1(0x40000000), %bb.2(0x40000000)
172-
; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0FFFFFFFFFFFFFFF, $sgpr34_sgpr35, $sgpr98_sgpr99
173-
; VR: S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr98_sgpr99
163+
; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71
164+
; VR: S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr70_sgpr71
174165
; VR: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
175166
; VR: S_BRANCH %bb.2
176167
bb.0:

0 commit comments

Comments
 (0)