llvm · lucas-rami · Jan 23, 2025 · Dec 12, 2024 · Jan 21, 2025 · Jan 21, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,7 +456,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
       uint64_t NumSGPRsForWavesPerEU = std::max(
           {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
       const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
-          STM.computeOccupancy(F, MFI.getLDSSize()),
+          STM.getOccupancyWithWorkGroupSizes(*MF).second,
           MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
           MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
           OutContext);
@@ -1175,22 +1175,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // Make clamp modifier on NaN input returns 0.
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
-  unsigned LDSAlignShift;
-  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
-    // LDS is allocated in 320 dword blocks.
-    LDSAlignShift = 11;
-  } else if (STM.getFeatureBits().test(
-                 FeatureAddressableLocalMemorySize65536)) {
-    // LDS is allocated in 128 dword blocks.
-    LDSAlignShift = 9;
-  } else {
-    // LDS is allocated in 64 dword blocks.
-    LDSAlignShift = 8;
-  }
-
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
 
+  unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity());
   ProgInfo.LDSSize = MFI->getLDSSize();
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
@@ -1272,8 +1260,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   }
 
   ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
-      STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
-      ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
+      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
 
   const auto [MinWEU, MaxWEU] =
       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
   }
 
   unsigned MaxOccupancy =
-      ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
+      ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second;
 
   // Restrict local memory usage so that we don't drastically reduce occupancy,
   // unless it is already significantly reduced.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -55,55 +55,96 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
   return getLocalMemorySize() / WorkGroupsPerCU;
 }
 
-// FIXME: Should return min,max range.
-//
-// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
-// be achieved when only the given function is running on the machine; and
-// taking into account the overall number of wave slots, the (maximum) workgroup
-// size, and the per-workgroup LDS allocation size.
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
-  const Function &F) const {
-  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
-  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
-  if (!MaxWorkGroupsPerCu)
-    return 0;
-
-  const unsigned WaveSize = getWavefrontSize();
-
-  // FIXME: Do we need to account for alignment requirement of LDS rounding the
-  // size up?
-  // Compute restriction based on LDS usage
-  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
-
-  // This can be queried with more LDS than is possible, so just assume the
-  // worst.
-  if (NumGroups == 0)
-    return 1;
-
-  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
-
-  // Round to the number of waves per CU.
-  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
-  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
-
-  // Number of waves per EU (SIMD).
-  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
+    uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const {
+  // Compute occupancy restriction based on LDS usage.
+  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+    // For GCN subtargets, LDS size must be aligned on allocation granularity.
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity());
+  }
 
-  // Clamp to the maximum possible number of waves.
-  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
+
+  // Queried LDS size may be larger than available on a CU, in which case we
+  // consider the only achievable occupancy to be 1, in line with what we
+  // consider the occupancy to be when the number of requested registers in a
+  // particular bank is higher than the number of available ones in that bank.
+  if (!MaxWGsLDS)
+    return {1, 1};
+
+  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
+
+  auto PropsFromWGSize = [=](unsigned WGSize)
+      -> std::tuple<const unsigned, const unsigned, unsigned> {
+    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
+    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
+    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
+  };
+
+  // The maximum group size will generally yield the minimum number of
+  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
+  // generally true for the minimum group size. LDS or barrier ressource
+  // limitations can flip those minimums/maximums.
+  const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
+  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
+  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
+
+  // It is possible that we end up with flipped minimum and maximum number of
+  // waves per CU when the number of minimum/maximum concurrent groups on the CU
+  // is limited by LDS usage or barrier resources.
+  if (MinWavesPerCU >= MaxWavesPerCU) {
+    std::swap(MinWavesPerCU, MaxWavesPerCU);
+  } else {
+    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
+
+    // Look for a potential smaller group size than the maximum which decreases
+    // the concurrent number of waves on the CU for the same number of
+    // concurrent workgroups on the CU.
+    unsigned MinWavesPerCUForWGSize =
+        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
+    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
+      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
+      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
+        // There may exist a smaller group size than the maximum that achieves
+        // the minimum number of waves per CU. This group size is the largest
+        // possible size that requires MaxWavesPerWG - E waves where E is
+        // maximized under the following constraints.
+        // 1. 0 <= E <= ExcessSlotsPerWG
+        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
+        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
+                                                MaxWavesPerWG - MinWavesPerWG);
+      }
+    }
 
-  // FIXME: Needs to be a multiple of the group size?
-  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+    // Look for a potential larger group size than the minimum which increases
+    // the concurrent number of waves on the CU for the same number of
+    // concurrent workgroups on the CU.
+    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
+    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
+      // There may exist a larger group size than the minimum that achieves the
+      // maximum number of waves per CU. This group size is the smallest
+      // possible size that requires MinWavesPerWG + L waves where L is
+      // maximized under the following constraints.
+      // 1. 0 <= L <= LeftoverSlotsPerWG
+      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
+      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
+                                              ((MaxWGSize - 1) / WaveSize) + 1 -
+                                                  MinWavesPerWG);
+    }
+  }
 
-  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
-         "computed invalid occupancy");
-  return MaxWaves;
+  // Return the minimum/maximum number of waves on any EU, assuming that all
+  // wavefronts are spread across all EUs as evenly as possible.
+  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
+          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
 }
 
-unsigned
-AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
+    const MachineFunction &MF) const {
   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(),
+                                        MF.getTarget());
 }
 
 std::pair<unsigned, unsigned>

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -127,11 +127,22 @@ class AMDGPUSubtarget {
   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                            const Function &) const;
 
-  /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
-  /// the given LDS memory size is the only constraint.
-  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
-
-  unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+  /// be achieved when the only function running on a CU is \p F and each
+  /// workgroup running the function requires \p LDSBytes bytes of LDS space.
+  /// This notably depends on the range of allowed flat group sizes for the
+  /// function and hardware characteristics.
+  std::pair<unsigned, unsigned>
+  getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F,
+                                 const TargetMachine &TM) const;
+
+  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+  /// be achieved when the only function running on a CU is \p MF. This notably
+  /// depends on the range of allowed flat group sizes for the function, the
+  /// amount of per-workgroup LDS space required by the function, and hardware
+  /// characteristics.
+  std::pair<unsigned, unsigned>
+  getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
 
   bool isAmdHsaOS() const {
     return TargetTriple.getOS() == Triple::AMDHSA;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1721,7 +1721,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
 
   if (MFI->Occupancy == 0) {
     // Fixup the subtarget dependent default value.
-    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
+    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
   }
 
   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1089,9 +1089,8 @@ bool PreRARematStage::initGCNSchedStage() {
     return false;
 
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  // Check maximum occupancy
-  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
-      DAG.MinOccupancy)
+  // Rematerialization will not help if occupancy is not limited by reg usage.
+  if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
     return false;
 
   // FIXME: This pass will invalidate cached MBBLiveIns for regions
@@ -1272,8 +1271,8 @@ void GCNSchedStage::checkScheduling() {
     return;
   }
 
-  unsigned TargetOccupancy =
-      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
+  unsigned TargetOccupancy = std::min(
+      S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
   unsigned WavesAfter =
       std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
   unsigned WavesBefore =

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -405,16 +405,25 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
 }
 
-unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
-                                        unsigned NumSGPRs,
-                                        unsigned NumVGPRs) const {
-  unsigned Occupancy =
-      std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
-  if (NumSGPRs)
-    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
-  if (NumVGPRs)
-    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
-  return Occupancy;
+std::pair<unsigned, unsigned>
+GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
+                               unsigned NumSGPRs, unsigned NumVGPRs) const {
+  auto [MinOcc, MaxOcc] =
+      getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine());
+  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
+  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
+
+  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
+  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
+  return {std::min(MinOcc, MaxOcc), MaxOcc};
+}
+
+unsigned GCNSubtarget::getLDSAllocGranularity() const {
+  if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize163840))
+    return 1280; // LDS is allocated in 320 dword blocks.
+  if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize65536))
+    return 512; // LDS is allocated in 128 dword blocks.
+  return 256;   // LDS is allocated in 64 dword blocks.
 }
 
 unsigned GCNSubtarget::getBaseMaxNumSGPRs(

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1368,12 +1368,21 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 
-  /// Return occupancy for the given function. Used LDS and a number of
-  /// registers if provided.
-  /// Note, occupancy can be affected by the scratch allocation as well, but
+  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+  /// be achieved when the only function running on a CU is \p F, each workgroup
+  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
+  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
+  /// range, so this returns a range as well.
+  ///
+  /// Note that occupancy can be affected by the scratch allocation as well, but
   /// we do not have enough information to compute it.
-  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
-                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+  std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
+                                                 unsigned LDSSize = 0,
+                                                 unsigned NumSGPRs = 0,
+                                                 unsigned NumVGPRs = 0) const;
+
+  /// Returns the LDS's allocation granularity in bytes.
+  unsigned getLDSAllocGranularity() const;
 
   /// \returns true if the flat_scratch register should be initialized with the
   /// pointer to the wave's scratch memory rather than a size and offset.

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,7 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
   assert(MaxNumWorkGroups.size() == 3);
 
-  Occupancy = ST.computeOccupancy(F, getLDSSize());
+  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
   CallingConv::ID CC = F.getCallingConv();
 
   VRegFlags.reserve(1024);
@@ -185,8 +185,7 @@ MachineFunctionInfo *SIMachineFunctionInfo::clone(
 void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
   limitOccupancy(getMaxWavesPerEU());
   const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
-  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
-                 MF.getFunction()));
+  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
 }
 
 Register SIMachineFunctionInfo::addPrivateSegmentBuffer(

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3642,18 +3642,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
 
 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                              MachineFunction &MF) const {
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
-                                                       MF.getFunction());
+  unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
   switch (RC->getID()) {
   default:
     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
   case AMDGPU::VGPR_32RegClassID:
-    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
+    return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::SGPR_LO16RegClassID:
-    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
+    return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
   }
 }