Skip to content

Commit e935fb9

Browse files
committed
Remat for the first loop
Change-Id: Ic390b34e4c921325607d245a2cba4fb14f5b35ee
1 parent e12cbd8 commit e935fb9

File tree

9 files changed

+2194
-715
lines changed

9 files changed

+2194
-715
lines changed

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,12 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
132132
"shorter local live ranges will tend to be allocated first"),
133133
cl::Hidden);
134134

135+
// When set, DefaultPriorityAdvisor::getPriority() clears its ForceGlobal flag
// before choosing a priority, so every non-empty live range that is contained
// in a single basic block takes the local-range branch during RS_Assign.
static cl::opt<bool> ForceLocalAssignment(
    "force-local-assignment",
    cl::desc("Never force the global priority heuristic, such that live "
             "ranges contained in a single basic block are always "
             "prioritized as local live ranges"),
    cl::Hidden);
140+
135141
static cl::opt<unsigned> SplitThresholdForRegWithHint(
136142
"split-threshold-for-reg-with-hint",
137143
cl::desc("The threshold for splitting a virtual register with a hint, in "
@@ -456,6 +462,7 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
456462
(Size / SlotIndex::InstrDist) >
457463
(2 * RegClassInfo.getNumAllocatableRegs(&RC)));
458464
unsigned GlobalBit = 0;
465+
ForceGlobal &= !ForceLocalAssignment;
459466

460467
if (Stage == RS_Assign && !ForceGlobal && !LI.empty() &&
461468
LIS->intervalIsInOneMBB(LI)) {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,58 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
5555
return getLocalMemorySize() / WorkGroupsPerCU;
5656
}
5757

58+
// FIXME: Should return min,max range.
59+
//
60+
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
61+
// be achieved when only the given function is running on the machine; and
62+
// taking into account the overall number of wave slots, the (maximum) workgroup
63+
// size, and the per-workgroup LDS allocation size.
64+
unsigned
65+
AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
66+
const Function &F) const {
67+
const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
68+
const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
69+
if (!MaxWorkGroupsPerCu)
70+
return 0;
71+
72+
const unsigned WaveSize = getWavefrontSize();
73+
74+
// FIXME: Do we need to account for alignment requirement of LDS rounding the
75+
// size up?
76+
// Compute restriction based on LDS usage
77+
unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
78+
79+
// This can be queried with more LDS than is possible, so just assume the
80+
// worst.
81+
if (NumGroups == 0)
82+
return 1;
83+
84+
NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
85+
86+
// Round to the number of waves per CU.
87+
const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
88+
unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
89+
90+
// Number of waves per EU (SIMD).
91+
MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
92+
93+
// Clamp to the maximum possible number of waves.
94+
MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
95+
96+
// FIXME: Needs to be a multiple of the group size?
97+
// MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
98+
99+
assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
100+
"computed invalid occupancy");
101+
return MaxWaves;
102+
}
103+
104+
unsigned
105+
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
106+
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
107+
return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
108+
}
109+
58110
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
59111
uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
60112

@@ -135,6 +187,87 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
135187
std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
136188
}
137189

190+
191+
std::pair<unsigned, unsigned>
192+
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
193+
const Function &F) const {
194+
// FIXME: We should take into account the LDS allocation granularity.
195+
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
196+
197+
// Queried LDS size may be larger than available on a CU, in which case we
198+
// consider the only achievable occupancy to be 1, in line with what we
199+
// consider the occupancy to be when the number of requested registers in a
200+
// particular bank is higher than the number of available ones in that bank.
201+
if (!MaxWGsLDS)
202+
return {1, 1};
203+
204+
const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
205+
206+
auto PropsFromWGSize = [=](unsigned WGSize)
207+
-> std::tuple<const unsigned, const unsigned, unsigned> {
208+
unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
209+
unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
210+
return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
211+
};
212+
213+
// The maximum group size will generally yield the minimum number of
214+
// workgroups, maximum number of waves, and minimum occupancy. The opposite is
215+
// generally true for the minimum group size. LDS or barrier ressource
216+
// limitations can flip those minimums/maximums.
217+
const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
218+
auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
219+
auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
220+
221+
// It is possible that we end up with flipped minimum and maximum number of
222+
// waves per CU when the number of minimum/maximum concurrent groups on the CU
223+
// is limited by LDS usage or barrier resources.
224+
if (MinWavesPerCU >= MaxWavesPerCU) {
225+
std::swap(MinWavesPerCU, MaxWavesPerCU);
226+
} else {
227+
const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
228+
229+
// Look for a potential smaller group size than the maximum which decreases
230+
// the concurrent number of waves on the CU for the same number of
231+
// concurrent workgroups on the CU.
232+
unsigned MinWavesPerCUForWGSize =
233+
divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
234+
if (MinWavesPerCU > MinWavesPerCUForWGSize) {
235+
unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
236+
if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
237+
// There may exist a smaller group size than the maximum that achieves
238+
// the minimum number of waves per CU. This group size is the largest
239+
// possible size that requires MaxWavesPerWG - E waves where E is
240+
// maximized under the following constraints.
241+
// 1. 0 <= E <= ExcessSlotsPerWG
242+
// 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
243+
MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
244+
MaxWavesPerWG - MinWavesPerWG);
245+
}
246+
}
247+
248+
// Look for a potential larger group size than the minimum which increases
249+
// the concurrent number of waves on the CU for the same number of
250+
// concurrent workgroups on the CU.
251+
unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
252+
if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
253+
// There may exist a larger group size than the minimum that achieves the
254+
// maximum number of waves per CU. This group size is the smallest
255+
// possible size that requires MinWavesPerWG + L waves where L is
256+
// maximized under the following constraints.
257+
// 1. 0 <= L <= LeftoverSlotsPerWG
258+
// 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
259+
MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
260+
((MaxWGSize - 1) / WaveSize) + 1 -
261+
MinWavesPerWG);
262+
}
263+
}
264+
265+
// Return the minimum/maximum number of waves on any EU, assuming that all
266+
// wavefronts are spread across all EUs as evenly as possible.
267+
return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
268+
std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
269+
}
270+
138271
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
139272
const MachineFunction &MF) const {
140273
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ class AMDGPUSubtarget {
8585
static const AMDGPUSubtarget &get(const TargetMachine &TM,
8686
const Function &F);
8787

88+
/// \returns the maximum occupancy, in waves per SIMD / EU, achievable when
/// only \p F is running on the machine and each workgroup allocates \p Bytes
/// bytes of LDS. The maximum flat workgroup size of \p F and the number of
/// wave slots are taken into account.
///
/// FIXME: Should return a min,max range.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes,
89+
const Function &F) const;
90+
91+
/// Overload which takes the LDS size and the function from \p MF.
unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
92+
8893
/// \returns Default range flat work group size for a calling convention.
8994
std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
9095

@@ -143,9 +148,8 @@ class AMDGPUSubtarget {
143148
/// This notably depends on the range of allowed flat group sizes for the
144149
/// function and hardware characteristics.
145150
std::pair<unsigned, unsigned>
146-
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
147-
return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
148-
}
151+
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
152+
149153

150154
/// Overload which uses the specified values for the flat work group sizes,
151155
/// rather than querying the function itself. \p FlatWorkGroupSizes should

0 commit comments

Comments
 (0)