[AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs

rovka · rovka · commit ea460637afd4 · 2025-03-06T10:54:15.000+01:00
In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32
VGPRs (based on a chip-wide setting which we can model with a Subtarget
feature). Update some of the subtarget helpers to reflect this.

In particular:
- getVGPRAllocGranule is set to the block size
- getAddressableNumVGPRs is limited to 8 * the size of a block

We also try to be more careful about how many VGPR blocks we allocate.
Therefore, when deciding if we should revert scheduling after a given
stage, we check that we haven't increased the number of VGPR blocks that
need to be allocated.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
   "Enable dynamic VGPR mode"
 >;
 
+def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
+  "DynamicVGPRBlockSize32",
+  "true",
+  "Use a block size of 32 for dynamic VGPR allocation (default is 16)"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
 
+  // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
+  if (ST.isDynamicVGPREnabled()) {
+    unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureBefore.getVGPRNum(false));
+    unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureAfter.getVGPRNum(false));
+    if (BlocksAfter > BlocksBefore)
+      return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
   bool DynamicVGPR = false;
+  bool DynamicVGPRBlockSize32 = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 8;
 
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
+
   bool IsWave32 = EnableWavefrontSize32 ?
       *EnableWavefrontSize32 :
       STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 512;
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    // On GFX12 we can allocate at most 8 blocks of VGPRs.
+    return 8 * getVGPRAllocGranule(STI);
   return getAddressableNumArchVGPRs(STI);
 }
 
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
   EXPECT_TRUE(ErrStr.empty()) << ErrStr;
 }
 
+static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
+                                  TestFuncTy test) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+                                      "+dynamic-vgpr," + FS.str());
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+  ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
+
+  std::stringstream Table;
+  bool Success = testAndRecord(Table, ST, test);
+  EXPECT_TRUE(Success && !PrintCpuRegLimits)
+      << CPUName << " dynamic VGPR " << FS
+      << ":\nOcc    MinVGPR        MaxVGPR\n"
+      << Table.str() << '\n';
+}
+
 TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
     unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -163,6 +181,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   };
 
   testGPRLimits("VGPR", true, test);
+
+  testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
+  testDynamicVGPRLimits("gfx1200",
+                        "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
+}
+
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+                               unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
+                               unsigned ExpectedMaxVGPRs) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+
+  // Test function without an 'amdgpu-waves-per-eu' attribute.
+  LLVMContext Context;
+  Module M("", Context);
+  Function *Func =
+      Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+                       GlobalValue::ExternalLinkage, "testFunc", &M);
+  Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
+  Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+
+  auto Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+      << CPUName << ' ' << FS;
+
+  // Function with requested 'amdgpu-waves-per-eu' in a valid range.
+  Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
+  Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
+}
+
+TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+  testAbsoluteLimits(
+      "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+      1, 16, 256);
 }
 
 static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
   Core
   GlobalISel
   MC
+  MIRParser
   Support
   TargetParser
   )

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS`
`13`	`13`	`Core`
`14`	`14`	`GlobalISel`
`15`	`15`	`MC`
	`16`	`+ MIRParser`
`16`	`17`	`Support`
`17`	`18`	`TargetParser`
`18`	`19`	`)`