@@ -55,6 +55,58 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
5555 return getLocalMemorySize () / WorkGroupsPerCU;
5656}
5757
58+ // FIXME: Should return min,max range.
59+ //
60+ // Returns the maximum occupancy, in number of waves per SIMD / EU, that can
61+ // be achieved when only the given function is running on the machine; and
62+ // taking into account the overall number of wave slots, the (maximum) workgroup
63+ // size, and the per-workgroup LDS allocation size.
64+ unsigned
65+ AMDGPUSubtarget::getOccupancyWithLocalMemSize (uint32_t Bytes,
66+ const Function &F) const {
67+ const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes (F).second ;
68+ const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU (MaxWorkGroupSize);
69+ if (!MaxWorkGroupsPerCu)
70+ return 0 ;
71+
72+ const unsigned WaveSize = getWavefrontSize ();
73+
74+ // FIXME: Do we need to account for alignment requirement of LDS rounding the
75+ // size up?
76+ // Compute restriction based on LDS usage
77+ unsigned NumGroups = getLocalMemorySize () / (Bytes ? Bytes : 1u );
78+
79+ // This can be queried with more LDS than is possible, so just assume the
80+ // worst.
81+ if (NumGroups == 0 )
82+ return 1 ;
83+
84+ NumGroups = std::min (MaxWorkGroupsPerCu, NumGroups);
85+
86+ // Round to the number of waves per CU.
87+ const unsigned MaxGroupNumWaves = divideCeil (MaxWorkGroupSize, WaveSize);
88+ unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
89+
90+ // Number of waves per EU (SIMD).
91+ MaxWaves = divideCeil (MaxWaves, getEUsPerCU ());
92+
93+ // Clamp to the maximum possible number of waves.
94+ MaxWaves = std::min (MaxWaves, getMaxWavesPerEU ());
95+
96+ // FIXME: Needs to be a multiple of the group size?
97+ // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
98+
99+ assert (MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU () &&
100+ " computed invalid occupancy" );
101+ return MaxWaves;
102+ }
103+
104+ unsigned
105+ AMDGPUSubtarget::getOccupancyWithLocalMemSize (const MachineFunction &MF) const {
106+ const auto *MFI = MF.getInfo <SIMachineFunctionInfo>();
107+ return getOccupancyWithLocalMemSize (MFI->getLDSSize (), MF.getFunction ());
108+ }
109+
58110std::pair<unsigned , unsigned > AMDGPUSubtarget::getOccupancyWithWorkGroupSizes (
59111 uint32_t LDSBytes, std::pair<unsigned , unsigned > FlatWorkGroupSizes) const {
60112
@@ -135,6 +187,87 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
135187 std::clamp (divideCeil (MaxWavesPerCU, getEUsPerCU ()), 1U , WavesPerEU)};
136188}
137189
190+
191+ std::pair<unsigned , unsigned >
192+ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes (uint32_t LDSBytes,
193+ const Function &F) const {
194+ // FIXME: We should take into account the LDS allocation granularity.
195+ const unsigned MaxWGsLDS = getLocalMemorySize () / std::max (LDSBytes, 1u );
196+
197+ // Queried LDS size may be larger than available on a CU, in which case we
198+ // consider the only achievable occupancy to be 1, in line with what we
199+ // consider the occupancy to be when the number of requested registers in a
200+ // particular bank is higher than the number of available ones in that bank.
201+ if (!MaxWGsLDS)
202+ return {1 , 1 };
203+
204+ const unsigned WaveSize = getWavefrontSize (), WavesPerEU = getMaxWavesPerEU ();
205+
206+ auto PropsFromWGSize = [=](unsigned WGSize)
207+ -> std::tuple<const unsigned , const unsigned , unsigned > {
208+ unsigned WavesPerWG = divideCeil (WGSize, WaveSize);
209+ unsigned WGsPerCU = std::min (getMaxWorkGroupsPerCU (WGSize), MaxWGsLDS);
210+ return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
211+ };
212+
213+ // The maximum group size will generally yield the minimum number of
214+ // workgroups, maximum number of waves, and minimum occupancy. The opposite is
215+ // generally true for the minimum group size. LDS or barrier ressource
216+ // limitations can flip those minimums/maximums.
217+ const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes (F);
218+ auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize (MinWGSize);
219+ auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize (MaxWGSize);
220+
221+ // It is possible that we end up with flipped minimum and maximum number of
222+ // waves per CU when the number of minimum/maximum concurrent groups on the CU
223+ // is limited by LDS usage or barrier resources.
224+ if (MinWavesPerCU >= MaxWavesPerCU) {
225+ std::swap (MinWavesPerCU, MaxWavesPerCU);
226+ } else {
227+ const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU ();
228+
229+ // Look for a potential smaller group size than the maximum which decreases
230+ // the concurrent number of waves on the CU for the same number of
231+ // concurrent workgroups on the CU.
232+ unsigned MinWavesPerCUForWGSize =
233+ divideCeil (WaveSlotsPerCU, MinWGsPerCU + 1 ) * MinWGsPerCU;
234+ if (MinWavesPerCU > MinWavesPerCUForWGSize) {
235+ unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
236+ if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
237+ // There may exist a smaller group size than the maximum that achieves
238+ // the minimum number of waves per CU. This group size is the largest
239+ // possible size that requires MaxWavesPerWG - E waves where E is
240+ // maximized under the following constraints.
241+ // 1. 0 <= E <= ExcessSlotsPerWG
242+ // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
243+ MinWavesPerCU -= MinWGsPerCU * std::min (ExcessSlotsPerWG,
244+ MaxWavesPerWG - MinWavesPerWG);
245+ }
246+ }
247+
248+ // Look for a potential larger group size than the minimum which increases
249+ // the concurrent number of waves on the CU for the same number of
250+ // concurrent workgroups on the CU.
251+ unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
252+ if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
253+ // There may exist a larger group size than the minimum that achieves the
254+ // maximum number of waves per CU. This group size is the smallest
255+ // possible size that requires MinWavesPerWG + L waves where L is
256+ // maximized under the following constraints.
257+ // 1. 0 <= L <= LeftoverSlotsPerWG
258+ // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
259+ MaxWavesPerCU += MaxWGsPerCU * std::min (LeftoverSlotsPerWG,
260+ ((MaxWGSize - 1 ) / WaveSize) + 1 -
261+ MinWavesPerWG);
262+ }
263+ }
264+
265+ // Return the minimum/maximum number of waves on any EU, assuming that all
266+ // wavefronts are spread across all EUs as evenly as possible.
267+ return {std::clamp (MinWavesPerCU / getEUsPerCU (), 1U , WavesPerEU),
268+ std::clamp (divideCeil (MaxWavesPerCU, getEUsPerCU ()), 1U , WavesPerEU)};
269+ }
270+
138271std::pair<unsigned , unsigned > AMDGPUSubtarget::getOccupancyWithWorkGroupSizes (
139272 const MachineFunction &MF) const {
140273 const auto *MFI = MF.getInfo <SIMachineFunctionInfo>();
0 commit comments