jrbyrnes
diff --git a/‎llvm/lib/CodeGen/RegAllocGreedy.cpp‎
Lines changed: 7 additions & 0 deletions b/‎llvm/lib/CodeGen/RegAllocGreedy.cpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp‎
Lines changed: 133 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp‎
Lines changed: 133 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 7 additions & 3 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 7 additions & 3 deletions
@@ -132,6 +132,12 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
              "shorter local live ranges will tend to be allocated first"),
     cl::Hidden);
 
+static cl::opt<bool> ForceLocalAssignment(
+    "force-local-assignment",
+    cl::desc("Force allocation order of local live ranges, such that "
+             "shorter local live ranges will tend to be allocated first"),
+    cl::Hidden);
+
 static cl::opt<unsigned> SplitThresholdForRegWithHint(
     "split-threshold-for-reg-with-hint",
     cl::desc("The threshold for splitting a virtual register with a hint, in "
@@ -456,6 +462,7 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
                         (Size / SlotIndex::InstrDist) >
                             (2 * RegClassInfo.getNumAllocatableRegs(&RC)));
     unsigned GlobalBit = 0;
+    ForceGlobal &= !ForceLocalAssignment;
 
     if (Stage == RS_Assign && !ForceGlobal && !LI.empty() &&
         LIS->intervalIsInOneMBB(LI)) {
 
@@ -55,6 +55,58 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
   return getLocalMemorySize() / WorkGroupsPerCU;
 }
 
+// FIXME: Should return min,max range.
+//
+// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
+// be achieved when only the given function is running on the machine; and
+// taking into account the overall number of wave slots, the (maximum) workgroup
+// size, and the per-workgroup LDS allocation size.
+unsigned
+AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+                                              const Function &F) const {
+  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+  if (!MaxWorkGroupsPerCu)
+    return 0;
+
+  const unsigned WaveSize = getWavefrontSize();
+
+  // FIXME: Do we need to account for alignment requirement of LDS rounding the
+  // size up?
+  // Compute restriction based on LDS usage
+  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+  // This can be queried with more LDS than is possible, so just assume the
+  // worst.
+  if (NumGroups == 0)
+    return 1;
+
+  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+  // Round to the number of waves per CU.
+  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
+  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+  // Number of waves per EU (SIMD).
+  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
+
+  // Clamp to the maximum possible number of waves.
+  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+  // FIXME: Needs to be a multiple of the group size?
+  // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+         "computed invalid occupancy");
+  return MaxWaves;
+}
+
+unsigned
+AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+}
+
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
     uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
 
@@ -135,6 +187,87 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
           std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
 }
 
+
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
+                                                const Function &F) const {
+  // FIXME: We should take into account the LDS allocation granularity.
+  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
+
+  // Queried LDS size may be larger than available on a CU, in which case we
+  // consider the only achievable occupancy to be 1, in line with what we
+  // consider the occupancy to be when the number of requested registers in a
+  // particular bank is higher than the number of available ones in that bank.
+  if (!MaxWGsLDS)
+    return {1, 1};
+
+  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
+
+  auto PropsFromWGSize = [=](unsigned WGSize)
+      -> std::tuple<const unsigned, const unsigned, unsigned> {
+    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
+    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
+    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
+  };
+
+  // The maximum group size will generally yield the minimum number of
+  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
+  // generally true for the minimum group size. LDS or barrier ressource
+  // limitations can flip those minimums/maximums.
+  const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
+  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
+  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
+
+  // It is possible that we end up with flipped minimum and maximum number of
+  // waves per CU when the number of minimum/maximum concurrent groups on the CU
+  // is limited by LDS usage or barrier resources.
+  if (MinWavesPerCU >= MaxWavesPerCU) {
+    std::swap(MinWavesPerCU, MaxWavesPerCU);
+  } else {
+    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
+
+    // Look for a potential smaller group size than the maximum which decreases
+    // the concurrent number of waves on the CU for the same number of
+    // concurrent workgroups on the CU.
+    unsigned MinWavesPerCUForWGSize =
+        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
+    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
+      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
+      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
+        // There may exist a smaller group size than the maximum that achieves
+        // the minimum number of waves per CU. This group size is the largest
+        // possible size that requires MaxWavesPerWG - E waves where E is
+        // maximized under the following constraints.
+        // 1. 0 <= E <= ExcessSlotsPerWG
+        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
+        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
+                                                MaxWavesPerWG - MinWavesPerWG);
+      }
+    }
+
+    // Look for a potential larger group size than the minimum which increases
+    // the concurrent number of waves on the CU for the same number of
+    // concurrent workgroups on the CU.
+    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
+    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
+      // There may exist a larger group size than the minimum that achieves the
+      // maximum number of waves per CU. This group size is the smallest
+      // possible size that requires MinWavesPerWG + L waves where L is
+      // maximized under the following constraints.
+      // 1. 0 <= L <= LeftoverSlotsPerWG
+      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
+      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
+                                              ((MaxWGSize - 1) / WaveSize) + 1 -
+                                                  MinWavesPerWG);
+    }
+  }
+
+  // Return the minimum/maximum number of waves on any EU, assuming that all
+  // wavefronts are spread across all EUs as evenly as possible.
+  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
+          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
+}
+
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
     const MachineFunction &MF) const {
   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -85,6 +85,11 @@ class AMDGPUSubtarget {
   static const AMDGPUSubtarget &get(const TargetMachine &TM,
                                     const Function &F);
 
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes,
+                                        const Function &F) const;
+
+  unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+
   /// \returns Default range flat work group size for a calling convention.
   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
 
@@ -143,9 +148,8 @@ class AMDGPUSubtarget {
   /// This notably depends on the range of allowed flat group sizes for the
   /// function and hardware characteristics.
   std::pair<unsigned, unsigned>
-  getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
-    return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
-  }
+  getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
+
 
   /// Overload which uses the specified values for the flat work group sizes,
   /// rather than querying the function itself. \p FlatWorkGroupSizes should