Skip to content

Commit dd83334

Browse files
committed
[AMDGPU] Ensure non-reserved CSR spilled regs are live-in
Fixes: ``` *** Bad machine code: Using an undefined physical register *** - function: widget - basic block: %bb.0 bb (0x564092cbe140) - instruction: $vgpr63 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - operand 1: killed $agpr13 LLVM ERROR: Found 1 machine code errors. ``` The detailed sequence of events that led to this change: 1. MachineVerifier fails because $agpr13 is not defined on line 19 below: ``` 1: bb.0.bb: 2: successors: %bb.1(0x80000000); %bb.1(100.00%) 3: liveins: $agpr14, $agpr15, $sgpr12, $sgpr13, $sgpr14, \ 4: $sgpr15, $sgpr30, $sgpr31, $sgpr34, $sgpr35, \ 5: $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr48, \ 6: $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, \ 7: $sgpr54, $sgpr55, $sgpr64, $sgpr65, $sgpr66, \ 8: $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, \ 9: $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, \ 10: $sgpr85, $sgpr86, $sgpr87, $sgpr96, $sgpr97, \ 11: $sgpr98, $sgpr99, $vgpr0, $vgpr31, $vgpr40, $vgpr41, \ 12: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, \ 13: $sgpr10_sgpr11 14: $sgpr16 = COPY $sgpr33 15: $sgpr33 = frame-setup COPY $sgpr32 16: $sgpr18_sgpr19 = S_XOR_SAVEEXEC_B64 -1, \ 17: implicit-def $exec, implicit-def dead $scc, \ 18: implicit $exec 19: $vgpr63 = V_ACCVGPR_READ_B32_e64 killed $agpr13, \ 20: implicit $exec 21: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, \ 22: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, \ 23: implicit $exec :: (store (s32) into %stack.38, \ 24: addrspace 5) 25: ... 26: $vgpr43 = IMPLICIT_DEF 27: $vgpr43 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, \ 28: killed $vgpr43(tied-def 0) 29: $vgpr43 = SI_SPILL_S32_TO_VGPR $sgpr14, 1, \ 30: killed $vgpr43(tied-def 0) 31: $sgpr100_sgpr101 = S_OR_SAVEEXEC_B64 -1, \ 32: implicit-def $exec, implicit-def dead $scc, \ 33: implicit $exec 34: renamable $agpr13 = COPY killed $vgpr43, implicit $exec ``` 2. That instruction is created by emitCSRSpillStores (called by SIFrameLowering::emitPrologue) because $agpr13 is in WWMSpills. See lines 982, 998, and 993 below. ``` 977: // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch 978: // registers. However, save all lanes of callee-saved VGPRs. Due to this, we 979: // might end up flipping the EXEC bits twice. 980: Register ScratchExecCopy; 981: SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 982: FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 983: if (!WWMScratchRegs.empty()) 984: ScratchExecCopy = 985: buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 986: /*IsProlog*/ true, /*EnableInactiveLanes*/ true); 987: 988: auto StoreWWMRegisters = 989: [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 990: for (const auto &Reg : WWMRegs) { 991: Register VGPR = Reg.first; 992: int FI = Reg.second; 993: buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, 994: VGPR, FI, FrameReg); 995: } 996: }; 997: 998: StoreWWMRegisters(WWMScratchRegs); ``` 3. $agpr13 got added to WWMSpills by SILowerWWMCopies::runOnMachineFunction as it processed the WWM_COPY on line 11 below (corresponds to line 34 above in point #1): ``` 1: %45:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, %45:vgpr_32(tied-def 0) 2: %45:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr14, 1, %45:vgpr_32(tied-def 0) 3: %44:av_32 = WWM_COPY %45:vgpr_32 ```
1 parent 5846381 commit dd83334

File tree

2 files changed

+133
-0
lines changed

2 files changed

+133
-0
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,7 @@ void SIFrameLowering::emitCSRSpillStores(
983983
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
984984
const SIInstrInfo *TII = ST.getInstrInfo();
985985
const SIRegisterInfo &TRI = TII->getRegisterInfo();
986+
MachineRegisterInfo &MRI = MF.getRegInfo();
986987

987988
// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
988989
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
@@ -1005,6 +1006,12 @@ void SIFrameLowering::emitCSRSpillStores(
10051006
}
10061007
};
10071008

1009+
for (const auto &Reg : WWMScratchRegs) {
1010+
if (!MRI.isReserved(Reg.first)) {
1011+
MRI.addLiveIn(Reg.first);
1012+
MBB.addLiveIn(Reg.first);
1013+
}
1014+
}
10081015
StoreWWMRegisters(WWMScratchRegs);
10091016

10101017
auto EnableAllLanes = [&]() {
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
; Just ensure that llc -O1 does not error out
2+
; RUN: llc -O1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs %s -o - &>/dev/null
3+
4+
define fastcc void @widget(i1 %arg) {
5+
bb:
6+
br label %bb1
7+
8+
bb1: ; preds = %bb3, %bb
9+
br i1 %arg, label %bb3, label %bb2
10+
11+
bb2: ; preds = %bb1
12+
ret void
13+
14+
bb3: ; preds = %bb1
15+
%call = call fastcc i1 @baz(i1 false, float 0.000000e+00, i1 false, float 0.000000e+00, i1 false, i1 false, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, ptr addrspace(5) null, i1 false, ptr null, ptr null, ptr null, ptr null, ptr null, ptr addrspace(5) null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr addrspace(5) null)
16+
br label %bb1
17+
}
18+
19+
define fastcc i1 @baz(i1 %arg, float %arg1, i1 %arg2, float %arg3, i1 %arg4, i1 %arg5, float %arg6, float %arg7, float %arg8, float %arg9, ptr addrspace(5) %arg10, i1 %arg11, ptr %arg12, ptr %arg13, ptr %arg14, ptr %arg15, ptr %arg16, ptr addrspace(5) %arg17, ptr %arg18, ptr %arg19, ptr %arg20, ptr %arg21, ptr %arg22, ptr %arg23, ptr %arg24, ptr addrspace(5) %arg25) #0 {
20+
bb:
21+
br i1 %arg, label %bb26, label %bb27
22+
23+
bb26: ; preds = %bb
24+
ret i1 false
25+
26+
bb27: ; preds = %bb
27+
br i1 %arg, label %bb29, label %bb28
28+
29+
bb28: ; preds = %bb27
30+
unreachable
31+
32+
bb29: ; preds = %bb49, %bb47, %bb46, %bb39, %bb36, %bb27
33+
br i1 %arg4, label %bb55, label %bb30
34+
35+
bb30: ; preds = %bb29
36+
br i1 %arg5, label %bb31, label %bb32
37+
38+
bb31: ; preds = %bb30
39+
store i1 false, ptr addrspace(5) null, align 2147483648
40+
br label %bb55
41+
42+
bb32: ; preds = %bb30
43+
store float %arg3, ptr addrspace(5) null, align 2147483648
44+
store float %arg7, ptr addrspace(5) %arg10, align 2147483648
45+
br i1 %arg2, label %bb34, label %bb33
46+
47+
bb33: ; preds = %bb32
48+
%fcmp = fcmp ogt float %arg6, 0.000000e+00
49+
br i1 %fcmp, label %bb34, label %bb35
50+
51+
bb34: ; preds = %bb33, %bb32
52+
br i1 %arg11, label %bb37, label %bb36
53+
54+
bb35: ; preds = %bb33
55+
store float 0.000000e+00, ptr addrspace(5) null, align 2147483648
56+
ret i1 false
57+
58+
bb36: ; preds = %bb34
59+
store i32 1, ptr addrspace(5) null, align 2147483648
60+
br label %bb29
61+
62+
bb37: ; preds = %bb34
63+
%load = load i8, ptr %arg12, align 2
64+
%trunc = trunc i8 %load to i1
65+
br i1 %trunc, label %bb38, label %bb54
66+
67+
bb38: ; preds = %bb37
68+
br i1 %arg4, label %bb39, label %bb53
69+
70+
bb39: ; preds = %bb38
71+
store float %arg1, ptr addrspace(5) null, align 2147483648
72+
%load40 = load float, ptr %arg15, align 8
73+
call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) %arg25, ptr %arg24, i64 12, i1 false)
74+
%load41 = load float, ptr %arg16, align 4
75+
call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) %arg17, ptr null, i64 36, i1 false)
76+
%load42 = load float, ptr %arg18, align 4
77+
%load43 = load float, ptr %arg19, align 4
78+
store float 0x7FF8000000000000, ptr addrspace(5) null, align 2147483648
79+
%load44 = load float, ptr %arg14, align 16
80+
store float %load44, ptr addrspace(5) null, align 2147483648
81+
%fcmp45 = fcmp ole float %arg9, 0.000000e+00
82+
br i1 %fcmp45, label %bb29, label %bb46
83+
84+
bb46: ; preds = %bb39
85+
%fsub = fsub float %arg8, %load40
86+
store float %fsub, ptr addrspace(5) null, align 2147483648
87+
%fadd = fadd float %load42, %load43
88+
br i1 %arg, label %bb29, label %bb47
89+
90+
bb47: ; preds = %bb46
91+
br i1 %arg, label %bb29, label %bb48
92+
93+
bb48: ; preds = %bb47
94+
br i1 %arg, label %bb49, label %bb52
95+
96+
bb49: ; preds = %bb48
97+
store float 0.000000e+00, ptr %arg23, align 4
98+
store float 0.000000e+00, ptr %arg22, align 8
99+
store float %fadd, ptr addrspace(5) null, align 2147483648
100+
%load50 = load float, ptr %arg20, align 4
101+
%fdiv = fdiv float %load41, %load50
102+
store float %fdiv, ptr addrspace(5) null, align 2147483648
103+
%load51 = load float, ptr %arg13, align 16
104+
store float %load51, ptr addrspace(5) null, align 2147483648
105+
store float 1.000000e+00, ptr %arg21, align 4
106+
br label %bb29
107+
108+
bb52: ; preds = %bb48
109+
unreachable
110+
111+
bb53: ; preds = %bb38
112+
ret i1 false
113+
114+
bb54: ; preds = %bb37
115+
ret i1 true
116+
117+
bb55: ; preds = %bb31, %bb29
118+
%load56 = load i1, ptr addrspace(5) null, align 2147483648
119+
ret i1 %load56
120+
}
121+
122+
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
123+
declare void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #1
124+
125+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
126+
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

0 commit comments

Comments
 (0)