Skip to content

Commit 476a6ea

Browse files
authored
AMDGPU: Track minNumAGPRs in MFI instead of mayUseAGPRs (#161996)
Fix MFMA AGPR allocation failures with -O0. Previously we were getting lucky on cases that can use AV registers with the normal optimization pipeline. This logic needs to be consistent with getMaxNumVectorRegs, as that is what getReservedRegs uses to determine the AGPR budget. In the future we should directly check the minimum AGPR budget, and individual selection patterns need to know the minimum budget required for them. Start accounting for the number of AGPRs required to perform the allocation. Refine the selection predicates to check that this number is available, and default to selecting the VGPR case if there aren't enough. This also avoids register allocation failures for the largest MFMAs with the default register budget.
1 parent 709980e commit 476a6ea

File tree

7 files changed

+383
-56
lines changed

7 files changed

+383
-56
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
50435043
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
50445044
case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
50455045
case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
5046+
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5047+
unsigned MinNumRegsRequired = DstSize / 32;
5048+
50465049
// Default for MAI intrinsics.
50475050
// srcC can also be an immediate which can be folded later.
50485051
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
50515054
// vdst, srcA, srcB, srcC
50525055
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
50535056
OpdsMapping[0] =
5054-
Info->mayNeedAGPRs()
5057+
Info->getMinNumAGPRs() >= MinNumRegsRequired
50555058
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
50565059
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
50575060
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
50585061
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
50595062
OpdsMapping[4] =
5060-
Info->mayNeedAGPRs()
5063+
Info->getMinNumAGPRs() >= MinNumRegsRequired
50615064
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
50625065
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
50635066
break;
50645067
}
50655068
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
50665069
case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
5070+
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5071+
unsigned MinNumRegsRequired = DstSize / 32;
5072+
50675073
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
50685074
OpdsMapping[0] =
5069-
Info->mayNeedAGPRs()
5075+
Info->getMinNumAGPRs() >= MinNumRegsRequired
50705076
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
50715077
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
50725078

50735079
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
50745080
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
50755081
OpdsMapping[4] =
5076-
Info->mayNeedAGPRs()
5082+
Info->getMinNumAGPRs() >= MinNumRegsRequired
50775083
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
50785084
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
50795085

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17357,7 +17357,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
1735717357
// use between vgpr and agpr as agpr tuples tend to be big.
1735817358
if (!MI.getDesc().operands().empty()) {
1735917359
unsigned Opc = MI.getOpcode();
17360-
bool HasAGPRs = Info->mayNeedAGPRs();
17360+
bool HasAGPRs =
17361+
!Subtarget->hasGFX90AInsts() || Info->getMinNumAGPRs() != 0;
1736117362
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1736217363
int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1736317364
for (auto I :

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,20 @@ using namespace llvm;
3333
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
3434
// where it is better to produce the VGPR form (e.g. if there are VGPR users
3535
// of the MFMA result).
36-
static cl::opt<bool> MFMAVGPRForm(
37-
"amdgpu-mfma-vgpr-form", cl::Hidden,
36+
static cl::opt<bool, true> MFMAVGPRFormOpt(
37+
"amdgpu-mfma-vgpr-form",
3838
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
3939
"unspecified, default to compiler heuristics"),
40-
cl::init(false));
40+
cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
41+
cl::Hidden);
4142

4243
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
4344
const SITargetLowering *TLI = STI->getTargetLowering();
4445
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
4546
}
4647

48+
bool SIMachineFunctionInfo::MFMAVGPRForm = false;
49+
4750
SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
4851
const GCNSubtarget *STI)
4952
: AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
@@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
8184
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
8285
}
8386

84-
MayNeedAGPRs = ST.hasMAIInsts();
8587
if (ST.hasGFX90AInsts()) {
86-
// FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
87-
// should be separated from availability of AGPRs
88-
if (MFMAVGPRForm ||
89-
(ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
90-
!mayUseAGPRs(F)))
91-
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
88+
// FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89+
// allocation granule and clamping.
90+
auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91+
AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
92+
/*OnlyFirstRequired=*/true);
93+
MinNumAGPRs = MinNumAGPRAttr;
9294
}
9395

9496
if (AMDGPU::isChainCC(CC)) {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
509509
// user arguments. This is an offset from the KernargSegmentPtr.
510510
bool ImplicitArgPtr : 1;
511511

512-
bool MayNeedAGPRs : 1;
512+
/// Minimum number of AGPRs required to allocate in the function. Only
513+
/// relevant for gfx90a-gfx950. For gfx908, this should be infinite.
514+
unsigned MinNumAGPRs = ~0u;
513515

514516
// The hard-wired high half of the address of the global information table
515517
// for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
@@ -537,6 +539,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
537539
void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
538540

539541
public:
542+
static bool MFMAVGPRForm;
543+
540544
struct VGPRSpillToAGPR {
541545
SmallVector<MCPhysReg, 32> Lanes;
542546
bool FullyAllocated = false;
@@ -1196,9 +1200,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
11961200

11971201
unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
11981202

1199-
bool mayNeedAGPRs() const {
1200-
return MayNeedAGPRs;
1201-
}
1203+
unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
12021204

12031205
// \returns true if a function has a use of AGPRs via inline asm or
12041206
// has a call which may use it.

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
6767
class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
6868
: VOP3P_Mix_Profile<P, Features, 0> {
6969
let IsTrue16 = 1;
70-
let IsRealTrue16 = 1;
70+
let IsRealTrue16 = 1;
7171
let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
7272
}
7373

@@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
950950
}
951951

952952
// Currently assumes scaled instructions never have abid
953-
class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
953+
class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag <
954954
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
955955
node:$src0_modifiers, node:$scale_src0,
956956
node:$src1_modifiers, node:$scale_src1),
@@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled =
959959
(ops node:$blgp))),
960960
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
961961
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
962-
(Op $src0, $src1, $src2, $cbsz, $blgp))),
963-
pred
964-
>;
965-
966-
defvar MayNeedAGPRs = [{
967-
return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
968-
}];
969-
970-
defvar MayNeedAGPRs_gisel = [{
971-
return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
972-
}];
962+
(Op $src0, $src1, $src2, $cbsz, $blgp)))>;
973963

974-
defvar MayNotNeedAGPRs = [{
975-
return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
976-
}];
964+
class CanUseAGPR_MAI<ValueType vt> {
965+
code PredicateCode = [{
966+
return !Subtarget->hasGFX90AInsts() ||
967+
(!SIMachineFunctionInfo::MFMAVGPRForm &&
968+
MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
969+
}] # !srl(vt.Size, 5) # ");";
977970

978-
defvar MayNotNeedAGPRs_gisel = [{
979-
return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
980-
}];
971+
code GISelPredicateCode = [{
972+
return !Subtarget->hasGFX90AInsts() ||
973+
(!SIMachineFunctionInfo::MFMAVGPRForm &&
974+
MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
975+
}] # !srl(vt.Size, 5) # ");";
976+
}
981977

982-
class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
978+
class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = true,
983979
bit Scaled = false> :
984-
MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> {
985-
let GISelPredicateCode = MayNeedAGPRs_gisel;
986-
}
980+
MAIFrag<Op, HasAbid, Scaled>,
981+
CanUseAGPR_MAI<vt>;
987982

988983
class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
989-
bit Scaled = false> :
990-
MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> {
991-
let GISelPredicateCode = MayNotNeedAGPRs_gisel;
992-
}
984+
bit Scaled = false> :
985+
MAIFrag<Op, HasAbid, Scaled>;
993986

994987
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
995988
defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
@@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
10371030
bit HasAbid = true,
10381031
bit Scaled = false> {
10391032
defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap;
1033+
defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P);
1034+
defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD");
1035+
10401036

10411037
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
10421038
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
10431039
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
1044-
def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
1045-
!if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
1040+
def _e64 : MAIInst<OpName, ProfileAGPR,
1041+
!if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
10461042
MFMATable<0, "AGPR", NAME # "_e64">;
10471043

10481044
let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
1049-
def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
1045+
def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR,
10501046
!if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
10511047
MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">;
10521048
}
@@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
10551051
let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
10561052
isConvertibleToThreeAddress = NoDstOverlap,
10571053
Mnemonic = OpName in {
1058-
def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
1059-
!if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
1054+
def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR,
1055+
!if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
10601056
MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">;
10611057

10621058
let OtherPredicates = [isGFX90APlus] in
1063-
def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
1059+
def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR,
10641060
!if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
10651061
MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
10661062
}
@@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
10741070
defvar UnscaledOpName = UnscaledOpName_#VariantSuffix;
10751071

10761072
defvar HasAbid = false;
1077-
1078-
defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap;
1073+
defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl);
1074+
defvar NoDstOverlap = Profile.NoDstOverlap;
10791075

10801076
def _e64 : ScaledMAIInst<OpName,
1081-
!cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>,
1077+
!cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, HasAbid, true>)>,
10821078
MFMATable<0, "AGPR", NAME # "_e64">;
10831079

10841080
def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd",
@@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
10901086
isConvertibleToThreeAddress = NoDstOverlap,
10911087
Mnemonic = UnscaledOpName_ in {
10921088
def _mac_e64 : ScaledMAIInst<OpName # "_mac",
1093-
!cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>,
1089+
!cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>,
10941090
MFMATable<1, "AGPR", NAME # "_e64">;
10951091

10961092
def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd",

0 commit comments

Comments
 (0)