Skip to content

Commit a910a6a

Browse files
authored
[AMDGPU] AsmPrinter: Unify arg handling (#151672)
When computing the number of registers required by entry functions, the `AMDGPUAsmPrinter` needs to take into account both the register usage computed by the `AMDGPUResourceUsageAnalysis` pass, and the number of registers initialized by the hardware. At the moment, the way it computes the latter is different for graphics vs compute, due to differences in the implementation. For kernels, all the information needed is available in the `SIMachineFunctionInfo`, but for graphics shaders we would iterate over the `Function` arguments in the `AMDGPUAsmPrinter`. This pretty much repeats some of the logic from instruction selection. This patch introduces 2 new members to `SIMachineFunctionInfo`, one for SGPRs and one for VGPRs. Both will be computed during instruction selection and then used during `AMDGPUAsmPrinter`, removing the need to refer to the `Function` when printing assembly. This patch is NFC except for the fact that we now add the extra SGPRs (VCC, XNACK etc) to the number of SGPRs computed for graphics entry points. I'm not sure why these weren't included before. It would be nice if someone could confirm if that was just an oversight or if we have some docs somewhere that I haven't managed to find. Only one test is affected (its SGPR usage increases because we now take into account the XNACK registers).
1 parent de72cca commit a910a6a

14 files changed

+125
-81
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 10 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
997997
const Function &F = MF.getFunction();
998998

999999
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1000-
// dispatch registers are function args.
1001-
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1002-
1003-
if (isShader(F.getCallingConv())) {
1004-
bool IsPixelShader =
1005-
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1006-
1007-
// Calculate the number of VGPR registers based on the SPI input registers
1008-
uint32_t InputEna = 0;
1009-
uint32_t InputAddr = 0;
1010-
unsigned LastEna = 0;
1011-
1012-
if (IsPixelShader) {
1013-
// Note for IsPixelShader:
1014-
// By this stage, all enabled inputs are tagged in InputAddr as well.
1015-
// We will use InputAddr to determine whether the input counts against the
1016-
// vgpr total and only use the InputEnable to determine the last input
1017-
// that is relevant - if extra arguments are used, then we have to honour
1018-
// the InputAddr for any intermediate non-enabled inputs.
1019-
InputEna = MFI->getPSInputEnable();
1020-
InputAddr = MFI->getPSInputAddr();
1021-
1022-
// We only need to consider input args up to the last used arg.
1023-
assert((InputEna || InputAddr) &&
1024-
"PSInputAddr and PSInputEnable should "
1025-
"never both be 0 for AMDGPU_PS shaders");
1026-
// There are some rare circumstances where InputAddr is non-zero and
1027-
// InputEna can be set to 0. In this case we default to setting LastEna
1028-
// to 1.
1029-
LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1030-
}
1000+
// dispatch registers as function args.
1001+
unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1002+
WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
10311003

1032-
// FIXME: We should be using the number of registers determined during
1033-
// calling convention lowering to legalize the types.
1034-
const DataLayout &DL = F.getDataLayout();
1035-
unsigned PSArgCount = 0;
1036-
unsigned IntermediateVGPR = 0;
1037-
for (auto &Arg : F.args()) {
1038-
unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1039-
if (Arg.hasAttribute(Attribute::InReg)) {
1040-
WaveDispatchNumSGPR += NumRegs;
1041-
} else {
1042-
// If this is a PS shader and we're processing the PS Input args (first
1043-
// 16 VGPR), use the InputEna and InputAddr bits to define how many
1044-
// VGPRs are actually used.
1045-
// Any extra VGPR arguments are handled as normal arguments (and
1046-
// contribute to the VGPR count whether they're used or not).
1047-
if (IsPixelShader && PSArgCount < 16) {
1048-
if ((1 << PSArgCount) & InputAddr) {
1049-
if (PSArgCount < LastEna)
1050-
WaveDispatchNumVGPR += NumRegs;
1051-
else
1052-
IntermediateVGPR += NumRegs;
1053-
}
1054-
PSArgCount++;
1055-
} else {
1056-
// If there are extra arguments we have to include the allocation for
1057-
// the non-used (but enabled with InputAddr) input arguments
1058-
if (IntermediateVGPR) {
1059-
WaveDispatchNumVGPR += IntermediateVGPR;
1060-
IntermediateVGPR = 0;
1061-
}
1062-
WaveDispatchNumVGPR += NumRegs;
1063-
}
1064-
}
1065-
}
1004+
if (WaveDispatchNumSGPR) {
10661005
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1067-
{ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1006+
{ProgInfo.NumSGPR,
1007+
MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1008+
Ctx)},
1009+
Ctx);
1010+
}
10681011

1012+
if (WaveDispatchNumVGPR) {
10691013
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
10701014
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
10711015

10721016
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
10731017
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1074-
} else if (isKernel(F.getCallingConv()) &&
1075-
MFI->getNumKernargPreloadedSGPRs()) {
1076-
// Consider cases where the total number of UserSGPRs with trailing
1077-
// allocated preload SGPRs, is greater than the number of explicitly
1078-
// referenced SGPRs.
1079-
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1080-
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1081-
ProgInfo.NumSGPR =
1082-
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
10831018
}
10841019

10851020
// Adjust number of registers used to meet default/requested minimum/maximum

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
580580
++i;
581581
}
582582

583+
if (Info->getNumKernargPreloadedSGPRs())
584+
Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
585+
583586
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
584587
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
585588
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
743746
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
744747
return false;
745748

749+
if (IsEntryFunc) {
750+
// This assumes the registers are allocated by CCInfo in ascending order
751+
// with no gaps.
752+
Info->setNumWaveDispatchSGPRs(
753+
CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
754+
Info->setNumWaveDispatchVGPRs(
755+
CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
756+
}
757+
746758
FormalArgHandler Handler(B, MRI);
747759
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
748760
return false;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
31063106
if (!IsKernel) {
31073107
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
31083108
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3109+
3110+
// This assumes the registers are allocated by CCInfo in ascending order
3111+
// with no gaps.
3112+
Info->setNumWaveDispatchSGPRs(
3113+
CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3114+
Info->setNumWaveDispatchVGPRs(
3115+
CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3116+
} else if (Info->getNumKernargPreloadedSGPRs()) {
3117+
Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
31093118
}
31103119

31113120
SmallVector<SDValue, 16> Chains;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
728728
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
729729
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
730730
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
731+
NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
732+
NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
731733
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
732734
Occupancy(MFI.getOccupancy()),
733735
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
784786
WaveLimiter = YamlMFI.WaveLimiter;
785787
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
786788
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
789+
NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
790+
NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
787791
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
788792
ReturnsVoid = YamlMFI.ReturnsVoid;
789793
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
270270
bool WaveLimiter = false;
271271
bool HasSpilledSGPRs = false;
272272
bool HasSpilledVGPRs = false;
273+
uint16_t NumWaveDispatchSGPRs = 0;
274+
uint16_t NumWaveDispatchVGPRs = 0;
273275
uint32_t HighBitsOf32BitAddress = 0;
274276

275277
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
327329
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
328330
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
329331
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
332+
YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
333+
YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
330334
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
331335
StringValue("$private_rsrc_reg"));
332336
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
465469
unsigned NumUserSGPRs = 0;
466470
unsigned NumSystemSGPRs = 0;
467471

472+
unsigned NumWaveDispatchSGPRs = 0;
473+
unsigned NumWaveDispatchVGPRs = 0;
474+
468475
bool HasSpilledSGPRs = false;
469476
bool HasSpilledVGPRs = false;
470477
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
991998
return UserSGPRInfo.getNumKernargPreloadSGPRs();
992999
}
9931000

1001+
unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
1002+
1003+
void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
1004+
1005+
unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
1006+
1007+
void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
1008+
9941009
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
9951010
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
9961011
}

llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
2-
;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
1+
;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
2+
;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
3+
;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
4+
;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
35

46
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
57
; ;CHECK: NumVgprs: 4
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED16
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck %s --check-prefixes=CHECK,SPLIT16
3+
4+
@global = addrspace(1) global i32 poison, align 4
5+
6+
; The hardware initializes the registers received as arguments by entry points,
7+
; so they will be counted even if unused.
8+
9+
; Vectors of i1 are always unpacked
10+
11+
; CHECK-LABEL: vec_of_i1:
12+
; CHECK: TotalNumSgprs: 8
13+
define amdgpu_ps void @vec_of_i1(<8 x i1> inreg %v8i1) {
14+
ret void
15+
}
16+
17+
; Vectors of i8 are always unpacked
18+
19+
; CHECK-LABEL: vec_of_i8:
20+
; CHECK: TotalNumSgprs: 4
21+
define amdgpu_ps void @vec_of_i8(<4 x i8> inreg %v4i8) {
22+
ret void
23+
}
24+
25+
; Vectors of 16-bit types are packed for newer architectures and unpacked for older ones.
26+
27+
; CHECK-LABEL: vec_of_16_bit_ty:
28+
; PACKED16: TotalNumSgprs: 3
29+
; SPLIT16: TotalNumSgprs: 6
30+
define amdgpu_ps void @vec_of_16_bit_ty(<2 x i16> inreg %v2i16, <4 x half> inreg %v4half) {
31+
ret void
32+
}
33+
34+
; CHECK-LABEL: buffer_fat_ptr:
35+
; CHECK: TotalNumSgprs: 5
36+
define amdgpu_ps void @buffer_fat_ptr(ptr addrspace(7) inreg %p) {
37+
ret void
38+
}

llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
2-
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
3-
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
1+
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
2+
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
4+
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
5+
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
6+
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
47

58
; This compute shader has input args that claim that it has 17 sgprs and 5 vgprs
69
; in wave dispatch. Ensure that the sgpr and vgpr counts in COMPUTE_PGM_RSRC1
@@ -17,7 +20,7 @@
1720
; GCN-NEXT: .scratch_memory_size: 0
1821
; SI-NEXT: .sgpr_count: 0x11
1922
; VI-NEXT: .sgpr_count: 0x60
20-
; GFX9-NEXT: .sgpr_count: 0x11
23+
; GFX9-NEXT: .sgpr_count: 0x15
2124
; SI-NEXT: .vgpr_count: 0x5
2225
; VI-NEXT: .vgpr_count: 0x5
2326
; GFX9-NEXT: .vgpr_count: 0x5

llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
; CHECK-NEXT: waveLimiter: false
1818
; CHECK-NEXT: hasSpilledSGPRs: false
1919
; CHECK-NEXT: hasSpilledVGPRs: false
20+
; CHECK-NEXT: numWaveDispatchSGPRs: 0
21+
; CHECK-NEXT: numWaveDispatchVGPRs: 0
2022
; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
2123
; CHECK-NEXT: frameOffsetReg: '$fp_reg'
2224
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
@@ -287,6 +289,8 @@
287289
; CHECK-NEXT: waveLimiter: false
288290
; CHECK-NEXT: hasSpilledSGPRs: false
289291
; CHECK-NEXT: hasSpilledVGPRs: false
292+
; CHECK-NEXT: numWaveDispatchSGPRs: 0
293+
; CHECK-NEXT: numWaveDispatchVGPRs: 0
290294
; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
291295
; CHECK-NEXT: frameOffsetReg: '$fp_reg'
292296
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
; AFTER-PEI-NEXT: waveLimiter: false
1717
; AFTER-PEI-NEXT: hasSpilledSGPRs: true
1818
; AFTER-PEI-NEXT: hasSpilledVGPRs: false
19+
; AFTER-PEI-NEXT: numWaveDispatchSGPRs: 0
20+
; AFTER-PEI-NEXT: numWaveDispatchVGPRs: 0
1921
; AFTER-PEI-NEXT: scratchRSrcReg: '$sgpr68_sgpr69_sgpr70_sgpr71'
2022
; AFTER-PEI-NEXT: frameOffsetReg: '$fp_reg'
2123
; AFTER-PEI-NEXT: stackPtrOffsetReg: '$sgpr32'

0 commit comments

Comments
 (0)