Skip to content

Commit ea46063

Browse files
committed
[AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs
In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting which we can model with a Subtarget feature). Update some of the subtarget helpers to reflect this. In particular: (1) getVGPRAllocGranule is set to the block size; (2) getAddressableNumVGPRs will limit itself to 8 * the size of a block. We also try to be more careful about how many VGPR blocks we allocate. Therefore, when deciding if we should revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated.
1 parent b2a7bdc commit ea46063

File tree

6 files changed

+86
-0
lines changed

6 files changed

+86
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
12451245
"Enable dynamic VGPR mode"
12461246
>;
12471247

1248+
// Subtarget feature "dynamic-vgpr-block-size-32": sets the subtarget field
// DynamicVGPRBlockSize32 to true. Per its description, it selects a block size
// of 32 for dynamic VGPR allocation; without it the block size is 16.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
1249+
"DynamicVGPRBlockSize32",
1250+
"true",
1251+
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
1252+
>;
1253+
12481254
// Dummy feature used to disable assembler instructions.
12491255
def FeatureDisable : SubtargetFeature<"",
12501256
"FeatureDisable","true",

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
14521452
if (WavesAfter < DAG.MinOccupancy)
14531453
return true;
14541454

1455+
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1456+
if (ST.isDynamicVGPREnabled()) {
1457+
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1458+
&ST, PressureBefore.getVGPRNum(false));
1459+
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1460+
&ST, PressureAfter.getVGPRNum(false));
1461+
if (BlocksAfter > BlocksBefore)
1462+
return true;
1463+
}
1464+
14551465
return false;
14561466
}
14571467

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
191191
unsigned MaxHardClauseLength = 0;
192192
bool SupportsSRAMECC = false;
193193
bool DynamicVGPR = false;
194+
bool DynamicVGPRBlockSize32 = false;
194195

195196
// This should not be used directly. 'TargetID' tracks the dynamic settings
196197
// for SRAMECC.

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
11541154
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
11551155
return 8;
11561156

1157+
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
1158+
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
1159+
11571160
bool IsWave32 = EnableWavefrontSize32 ?
11581161
*EnableWavefrontSize32 :
11591162
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
11951198
/// Returns the number of VGPRs that are addressable on subtarget \p STI.
///
/// The checks are ordered, first match wins:
///  - FeatureGFX90AInsts set: 512.
///  - FeatureDynamicVGPR set: 8 times getVGPRAllocGranule(STI), i.e. eight
///    allocation blocks.
///  - otherwise: getAddressableNumArchVGPRs(STI).
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
11961199
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
11971200
return 512;
1201+
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
1202+
// On GFX12 we can allocate at most 8 blocks of VGPRs.
1203+
return 8 * getVGPRAllocGranule(STI);
11981204
return getAddressableNumArchVGPRs(STI);
11991205
}
12001206

llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
152152
EXPECT_TRUE(ErrStr.empty()) << ErrStr;
153153
}
154154

155+
/// Runs \p test against a subtarget that has dynamic VGPR mode enabled.
///
/// Creates a target machine for \p CPUName whose feature string is
/// "+dynamic-vgpr," followed by \p FS, builds a GCNSubtarget from it, and
/// hands the subtarget, \p test and a string stream to testAndRecord. The
/// contents of that stream are attached to the final expectation's message.
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
156+
TestFuncTy test) {
157+
// Dynamic VGPR mode is always requested here; FS only adds further features.
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
158+
"+dynamic-vgpr," + FS.str());
159+
ASSERT_TRUE(TM) << "No target machine";
160+
161+
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
162+
std::string(TM->getTargetFeatureString()), *TM);
163+
// Bail out if the subtarget did not actually pick up the dynamic VGPR feature.
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
164+
165+
std::stringstream Table;
166+
bool Success = testAndRecord(Table, ST, test);
167+
// The expectation fails when testAndRecord reports failure or when
// PrintCpuRegLimits is set; in both cases the recorded table is streamed
// into the failure message. NOTE(review): presumably PrintCpuRegLimits is a
// debugging switch used to dump the table — confirm where it is defined.
EXPECT_TRUE(Success && !PrintCpuRegLimits)
168+
<< CPUName << " dynamic VGPR " << FS
169+
<< ":\nOcc MinVGPR MaxVGPR\n"
170+
<< Table.str() << '\n';
171+
}
172+
155173
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
156174
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
157175
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -163,6 +181,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
163181
};
164182

165183
testGPRLimits("VGPR", true, test);
184+
185+
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
186+
testDynamicVGPRLimits("gfx1200",
187+
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
188+
}
189+
190+
/// Checks the waves-per-EU range and the VGPR limits reported by a subtarget.
///
/// Builds a GCNSubtarget for \p CPUName with feature string \p FS and a void
/// function "testFunc" using the AMDGPU_CS_Chain calling convention and
/// "amdgpu-flat-work-group-size"="1,32". It then expects:
///  - ST.getWavesPerEU(*Func) to equal {\p ExpectedMinOcc, \p ExpectedMaxOcc};
///  - ST.getMaxNumVGPRs(*Func) and ST.getAddressableNumVGPRs() to both equal
///    \p ExpectedMaxVGPRs.
/// Finally it adds "amdgpu-waves-per-eu"="10,12" to the function and expects
/// getWavesPerEU to return {10, 12}.
static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
191+
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
192+
unsigned ExpectedMaxVGPRs) {
193+
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
194+
ASSERT_TRUE(TM) << "No target machine";
195+
196+
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
197+
std::string(TM->getTargetFeatureString()), *TM);
198+
199+
// Test function without an 'amdgpu-waves-per-eu' attribute; only the calling
// convention and the flat work group size are set.
200+
LLVMContext Context;
201+
Module M("", Context);
202+
Function *Func =
203+
Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
204+
GlobalValue::ExternalLinkage, "testFunc", &M);
205+
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
206+
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
207+
208+
auto Range = ST.getWavesPerEU(*Func);
209+
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
210+
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
211+
// The per-function limit and the subtarget-wide addressable count are both
// compared against the same expected value.
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
212+
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
213+
<< CPUName << ' ' << FS;
214+
215+
// Function with requested 'amdgpu-waves-per-eu' in a valid range: the
// requested bounds are expected back unchanged.
216+
Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
217+
Range = ST.getWavesPerEU(*Func);
218+
EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
219+
EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
220+
}
221+
222+
// Exercises testAbsoluteLimits on gfx1200 in wave32 mode for three feature
// sets. Every case expects a waves-per-EU range of [1, 16]; only the expected
// VGPR limit differs.
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
223+
// Dynamic VGPR mode off: 256 VGPRs expected.
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
224+
// Dynamic VGPR mode with the default block size of 16: 8 * 16 = 128.
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
225+
// Dynamic VGPR mode with block size 32: 8 * 32 = 256.
testAbsoluteLimits(
226+
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
227+
1, 16, 256);
166228
}
167229

168230
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {

llvm/unittests/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
1313
Core
1414
GlobalISel
1515
MC
16+
MIRParser
1617
Support
1718
TargetParser
1819
)

0 commit comments

Comments
 (0)