
Commit befe537

[AMDGPU] Ignore inactive VGPRs in .vgpr_count
When using the amdgcn.init.whole.wave intrinsic, we add dummy VGPR arguments with the purpose of preserving their inactive lanes. The pattern may look something like this:

```
entry:
  call amdgcn.init.whole.wave
  branch to shader or tail

shader:
  $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes
  actual code...

tail:
  call amdgcn.cs.chain [...], implicit $vInactive
```

We should not report these VGPRs in the .vgpr_count metadata. This patch achieves that goal by ignoring IMPLICIT_DEFs and SI_TCRETURNs in functions that use the amdgcn.init.whole.wave intrinsic.

It also simplifies the code in AMDGPUResourceUsageAnalysis to rely more on the TargetRegisterInfo for computing the number of used registers in the simple cases.

This is a reworked version of llvm#133242, which was reverted in llvm#144039.
1 parent 408e550 commit befe537

10 files changed: +403 -262 lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 7 additions & 1 deletion
```diff
@@ -991,7 +991,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv())) {
+  // Entry functions need to count input arguments even if they're not used
+  // (i.e. not reported by AMDGPUResourceUsageAnalysis). Other functions can
+  // skip including them. This is especially important for shaders that use the
+  // init.whole.wave intrinsic, since they sometimes have VGPR arguments that
+  // are only added for the purpose of preserving their inactive lanes and
+  // should not be included in the vgpr-count.
+  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
```

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 44 additions & 255 deletions
```diff
@@ -139,268 +139,56 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
 
   Info.UsesVCC =
       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+  Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
+                                                /*IncludeCalls=*/false);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
+                                          /*IncludeCalls=*/false);
 
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
   // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-    if (ST.hasMAIInsts())
-      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+  bool HasCalls = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
+  // Functions that use the llvm.amdgcn.init.whole.wave intrinsic often have
+  // VGPR arguments that are only added for the purpose of preserving the
+  // inactive lanes. These should not be included in the number of used VGPRs.
+  bool NeedsExplicitVGPRCount = MFI->hasInitWholeWave();
+  if (!HasCalls && !NeedsExplicitVGPRCount) {
+
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
+                                          /*IncludeCalls=*/false);
     return Info;
   }
 
   int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE_LO:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT_LO:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE_LO:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::SGPR_NULL64:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr() &&
-                 "Instruction uses invalid noreg register");
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
-            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 9;
-        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 9;
-        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 9;
-        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 10;
-        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 10;
-        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 10;
-        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 11;
-        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 11;
-        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 11;
-        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 12;
-        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 12;
-        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 12;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          // We only expect TTMP registers or registers that do not belong to
-          // any RC.
-          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
-                  !TRI.getPhysRegBaseClass(Reg)) &&
-                 "Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
+      if (NeedsExplicitVGPRCount) {
+        for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+          const MachineOperand &MO = MI.getOperand(i);
+
+          if (!MO.isReg())
+            continue;
+          Register Reg = MO.getReg();
+          const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+
+          if (!RC || !TRI.isVGPRClass(RC))
+            continue;
+
+          // Skip inactive VGPRs in chain functions with the init.whole.wave
+          // intrinsic. These will only appear as implicit use operands on the
+          // chain call, and as the def of an IMPLICIT_DEF. We're going to skip
+          // implicit defs unconditionally though because if they're important
+          // in a different context then they will be counted when they are
+          // used.
+          bool IsChainCall =
+              MFI->isChainFunction() && MI.getOpcode() == AMDGPU::SI_TCRETURN;
+          if (IsChainCall || MI.isImplicitDef())
+            continue;
+
+          unsigned Width = TRI.getRegSizeInBits(*RC) / 32;
+          unsigned HWReg = TRI.getHWRegIndex(Reg);
+          int MaxUsed = HWReg + Width - 1;
           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
         }
       }
@@ -464,9 +252,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
+  if (NeedsExplicitVGPRCount)
+    Info.NumVGPR = MaxVGPR + 1;
+  else
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, false);
 
   return Info;
 }
```
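The rewritten loop only has to answer one question per counted operand: what is the highest 32-bit VGPR it touches? The reported count is then that highest hardware index plus one. Below is a minimal, self-contained sketch of that arithmetic in plain C++ for illustration; it is not the actual LLVM pass, and `CountedUse` and `vgprCount` are made-up names:

```cpp
#include <algorithm>
#include <vector>

// One operand that survives the filtering above (not an IMPLICIT_DEF, not an
// implicit use on a chain call): the HW index of its first 32-bit register
// and how many 32-bit registers the tuple covers.
struct CountedUse {
  unsigned HWRegIndex;
  unsigned Width;
};

// Mirrors "MaxUsed = HWReg + Width - 1" and "NumVGPR = MaxVGPR + 1": track
// the highest register index touched, then report that index plus one.
unsigned vgprCount(const std::vector<CountedUse> &Uses) {
  int MaxVGPR = -1; // -1 means no VGPR used, giving a count of 0
  for (const CountedUse &U : Uses)
    MaxVGPR = std::max(MaxVGPR, static_cast<int>(U.HWRegIndex + U.Width - 1));
  return static_cast<unsigned>(MaxVGPR + 1);
}
```

For example, a single use of a 64-bit tuple starting at v10 corresponds to `{10, 2}` and yields a count of 12.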

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -4046,11 +4046,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
   return 0;
 }
 
-unsigned
-SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
-                                   const TargetRegisterClass &RC) const {
+unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+                                            const TargetRegisterClass &RC,
+                                            bool IncludeCalls) const {
   for (MCPhysReg Reg : reverse(RC.getRegisters()))
-    if (MRI.isPhysRegUsed(Reg))
+    if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
       return getHWRegIndex(Reg) + 1;
   return 0;
 }
```
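getNumUsedPhysRegs works the same way, but over the whole register file of a class: it scans from the top register downwards and returns the index of the first (i.e. highest) used register plus one. A plain-C++ analogue of that reverse scan, for illustration only (`numUsedRegs` is a made-up name, and a `std::vector<bool>` stands in for MachineRegisterInfo's used-register query):

```cpp
#include <vector>

// Returns "highest used register index + 1", or 0 if nothing is used,
// mirroring the reverse(RC.getRegisters()) scan above.
unsigned numUsedRegs(const std::vector<bool> &Used /* indexed by HW reg */) {
  for (unsigned I = Used.size(); I > 0; --I)
    if (Used[I - 1])
      return I;
  return 0;
}
```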

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 4 additions & 2 deletions
```diff
@@ -482,9 +482,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                                 unsigned SubReg) const;
 
   // \returns a number of registers of a given \p RC used in a function.
-  // Does not go inside function calls.
+  // Does not go inside function calls. If \p IncludeCalls is true, it will
+  // include registers that may be clobbered by calls.
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
-                              const TargetRegisterClass &RC) const;
+                              const TargetRegisterClass &RC,
+                              bool IncludeCalls = true) const;
 
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
```
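The default argument keeps existing callers unchanged: omitting `IncludeCalls` behaves as before, while the resource usage analysis above passes false so that registers appearing only in call regmasks are ignored. A usage fragment (not a complete function; it assumes a `SIRegisterInfo &TRI` and a `MachineRegisterInfo &MRI` are in scope, as in the analysis pass):

```cpp
// Count only registers the function itself defines or reads, ignoring
// registers that are merely clobbered by calls (what the analysis now does).
unsigned NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
                                          /*IncludeCalls=*/false);

// Previous behaviour, still the default: call-clobbered registers count too.
unsigned NumVGPRWithCalls =
    TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
```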
