
Commit 1f0f347

[AMDGPU] High VGPR lowering on gfx1250 (#156965)
1 parent c689919 commit 1f0f347

13 files changed, +1378 −4 lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
@@ -501,6 +501,9 @@ extern char &SIModeRegisterID;
 void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 
+void initializeAMDGPULowerVGPREncodingPass(PassRegistry &);
+extern char &AMDGPULowerVGPREncodingID;
+
 void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
 extern char &SIInsertHardClausesID;

llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp

Lines changed: 354 additions & 0 deletions
@@ -0,0 +1,354 @@
//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Lower VGPRs above the first 256 on gfx1250.
///
/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
/// the VGPR addressing mode. The mode change is effective until the next
/// change. This instruction provides high bits of a VGPR address for four of
/// the operands: vdst, src0, src1, and src2, or four other operands depending
/// on the instruction encoding. If bits are set, they are added as MSBs to the
/// corresponding operand's VGPR number.
///
/// There is no need to replace actual register operands because the encoding
/// of high and low VGPRs is the same. I.e. v0 has the encoding 0x100 and so
/// does v256; v1 has the encoding 0x101 and v257 has the same encoding. So
/// high VGPRs survive until actual encoding and result in the same actual bit
/// encoding.
///
/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual
/// offset to the VGPR addresses of the subsequent instructions. The
/// InstPrinter takes care of printing a low VGPR instead of a high one. In
/// principle it would be viable to print actual high VGPR numbers, but that
/// would disagree with the disassembler's printing and create a situation
/// where the asm text is not deterministic.
///
/// This pass creates a convention where non-fall-through basic blocks shall
/// start with all 4 MSBs zero; otherwise a disassembly would not be readable.
/// An optimization here is possible but deemed not desirable because of these
/// readability concerns.
///
/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry.
/// The pass must run very late in the pipeline to make sure no changes to VGPR
/// operands are made after it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/PackedVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"

namespace {

class AMDGPULowerVGPREncoding : public MachineFunctionPass {
  static constexpr unsigned OpNum = 4;
  static constexpr unsigned BitsPerField = 2;
  static constexpr unsigned NumFields = 4;
  static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
  using ModeType = PackedVector<unsigned, BitsPerField,
                                std::bitset<BitsPerField * NumFields>>;

  class ModeTy : public ModeType {
  public:
    // The bitset constructor sets all bits to zero.
    ModeTy() : ModeType(0) {}

    operator int64_t() const { return raw_bits().to_ulong(); }

    static ModeTy fullMask() {
      ModeTy M;
      M.raw_bits().flip();
      return M;
    }
  };

public:
  static char ID;

  AMDGPULowerVGPREncoding() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

  /// Most recent s_set_* instruction.
  MachineInstr *MostRecentModeSet;

  /// Whether the current mode is known.
  bool CurrentModeKnown;

  /// Current mode bits.
  ModeTy CurrentMode;

  /// Current mask of mode bits that instructions since MostRecentModeSet care
  /// about.
  ModeTy CurrentMask;

  /// Number of current hard clause instructions.
  unsigned ClauseLen;

  /// Number of hard clause instructions remaining.
  unsigned ClauseRemaining;

  /// Clause group breaks.
  unsigned ClauseBreaks;

  /// Last hard clause instruction.
  MachineInstr *Clause;

  /// Insert a mode change before \p I. \returns true if the mode was changed.
  bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);

  /// Reset mode to default.
  void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }

  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

  /// Handle a single \p MI. \returns true if changed.
  bool runOnMachineInstr(MachineInstr &MI);

  /// Compute the mode and mode mask for a single \p MI given the \p Ops
  /// operand-to-bit mapping. Optionally takes a second array \p Ops2 for VOPD.
  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
  /// is checked.
  void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
                   const AMDGPU::OpName Ops[OpNum],
                   const AMDGPU::OpName *Ops2 = nullptr);

  /// Check if an instruction \p I is within a clause and return a suitable
  /// point to insert a mode change. It may also modify the S_CLAUSE
  /// instruction to extend it, or drop the clause if it cannot be adjusted.
  MachineInstr *handleClause(MachineInstr *I);
};

bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
                                      MachineInstr *I) {
  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());

  if (CurrentModeKnown) {
    auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();

    if ((Delta & Mask.raw_bits()).none()) {
      CurrentMask |= Mask;
      return false;
    }

    if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
      CurrentMode |= NewMode;
      CurrentMask |= Mask;

      MostRecentModeSet->getOperand(0).setImm(CurrentMode);
      return true;
    }
  }

  I = handleClause(I);
  MostRecentModeSet =
      BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
          .addImm(NewMode);

  CurrentMode = NewMode;
  CurrentMask = Mask;
  CurrentModeKnown = true;
  return true;
}

std::optional<unsigned>
AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
  if (!MO.isReg())
    return std::nullopt;

  MCRegister Reg = MO.getReg();
  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
  if (!RC || !TRI->isVGPRClass(RC))
    return std::nullopt;

  unsigned Idx = TRI->getHWRegIndex(Reg);
  return Idx >> 8;
}

void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
                                          MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  NewMode = {};
  Mask = {};

  for (unsigned I = 0; I < OpNum; ++I) {
    MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);

    std::optional<unsigned> MSBits;
    if (Op)
      MSBits = getMSBs(*Op);

#if !defined(NDEBUG)
    if (MSBits.has_value() && Ops2) {
      auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
      if (Op2) {
        std::optional<unsigned> MSBits2;
        MSBits2 = getMSBs(*Op2);
        if (MSBits2.has_value() && MSBits != MSBits2)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    if (!MSBits.has_value() && Ops2) {
      Op = TII->getNamedOperand(MI, Ops2[I]);
      if (Op)
        MSBits = getMSBs(*Op);
    }

    if (!MSBits.has_value())
      continue;

    // Skip tied uses of src2 of VOP2; these will be handled along with defs,
    // and only the vdst bit affects these operands. We cannot skip tied uses
    // of VOP3; these uses are real even if they must match the vdst.
    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
        (SIInstrInfo::isVOP2(MI) ||
         (SIInstrInfo::isVOP3(MI) &&
          TII->hasVALU32BitEncoding(MI.getOpcode()))))
      continue;

    NewMode[I] = MSBits.value();
    Mask[I] = FieldMask;
  }
}

bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
  if (Ops.first) {
    ModeTy NewMode, Mask;
    computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
    return setMode(NewMode, Mask, &MI);
  }
  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());

  return false;
}

MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
  if (!ClauseRemaining)
    return I;

  // A clause cannot start with a special instruction; place it right before
  // the clause.
  if (ClauseRemaining == ClauseLen) {
    I = Clause->getPrevNode();
    assert(I->isBundle());
    return I;
  }

  // If a clause defines breaks, each group cannot start with a mode change,
  // so just drop the clause.
  if (ClauseBreaks) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise adjust the number of instructions in the clause if it fits.
  // If it does not, the clause will just become shorter. Since the length
  // recorded in the clause is one less, increment the length after the
  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
  if (ClauseLen < 63)
    Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));

  ++ClauseLen;

  return I;
}

bool AMDGPULowerVGPREncoding::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  bool Changed = false;
  ClauseLen = ClauseRemaining = 0;
  CurrentMode.reset();
  CurrentMask.reset();
  CurrentModeKnown = true;
  for (auto &MBB : MF) {
    MostRecentModeSet = nullptr;

    for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      if (MI.isTerminator() || MI.isCall()) {
        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
          CurrentMode.reset();
          CurrentModeKnown = true;
        } else
          resetMode(&MI);
        continue;
      }

      if (MI.isInlineAsm()) {
        if (TII->hasVGPRUses(MI))
          resetMode(&MI);
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        ClauseLen = MI.getOperand(0).getImm();
        ClauseBreaks = (ClauseLen >> 8) & 15;
        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
        Clause = &MI;
        continue;
      }

      Changed |= runOnMachineInstr(MI);

      if (ClauseRemaining)
        --ClauseRemaining;
    }

    // If we're falling through to a block that has at least one other
    // predecessor, we no longer know the mode.
    MachineBasicBlock *Next = MBB.getNextNode();
    if (Next && Next->pred_size() >= 2 &&
        llvm::is_contained(Next->predecessors(), &MBB)) {
      if (CurrentMode.raw_bits().any())
        CurrentModeKnown = false;
    }
  }

  return Changed;
}

} // namespace

char AMDGPULowerVGPREncoding::ID = 0;

char &llvm::AMDGPULowerVGPREncodingID = AMDGPULowerVGPREncoding::ID;

INITIALIZE_PASS(AMDGPULowerVGPREncoding, DEBUG_TYPE,
                "AMDGPU Lower VGPR Encoding", false, false)

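Editor's note: handleClause() above grows an S_CLAUSE by one instruction when a mode change must land inside it. The sketch below spells out the S_CLAUSE immediate layout the pass assumes (length minus one in SIMM16[5:0], break count read from SIMM16[11:8]); the helper names are illustrative, not from the commit.

#include <cassert>

// Decode/re-encode of the S_CLAUSE immediate as used by the pass:
// runOnMachineFunction() reads length and breaks, handleClause() writes back
// an incremented length when the clause grows to cover S_SET_VGPR_MSB.
struct ClauseImm {
  unsigned Len;    // number of clause instructions (SIMM16[5:0] stores Len - 1)
  unsigned Breaks; // group break count (SIMM16[11:8])
};

static ClauseImm decodeClauseImm(unsigned Imm) {
  return {(Imm & 63) + 1, (Imm >> 8) & 15};
}

static unsigned encodeClauseImm(ClauseImm C) {
  assert(C.Len >= 2 && C.Len <= 63 && "SIMM16[5:0] must stay in 1..62");
  return (C.Len - 1) | (C.Breaks << 8);
}

int main() {
  ClauseImm C = decodeClauseImm(3); // a 4-instruction clause, no breaks
  ++C.Len;                          // grow it to make room for S_SET_VGPR_MSB
  return encodeClauseImm(C) == 4 ? 0 : 1;
}
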
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 7 additions & 0 deletions
@@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
                           MF->getInfo<SIMachineFunctionInfo>(),
                           *OutStreamer);
 
+  if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+    unsigned V = MI->getOperand(0).getImm();
+    OutStreamer->AddComment(
+        " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
+        " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
+  }
+
   MCInst TmpInst;
   MCInstLowering.lower(MI, TmpInst);
   EmitToStreamer(*OutStreamer, TmpInst);

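Editor's note: the verbose comment emitted above decomposes the S_SET_VGPR_MSB immediate as src0 in bits [1:0], src1 in [3:2], src2 in [5:4] and dst in [7:6]. A tiny sketch reproducing that decoding with a worked value (illustrative only, not part of the commit):

#include <cstdio>

// Mirrors the field extraction used in AMDGPUAsmPrinter::emitInstruction()
// when printing the "msbs:" comment for S_SET_VGPR_MSB.
static void printMSBComment(unsigned V) {
  std::printf("msbs: dst=%u src0=%u src1=%u src2=%u\n",
              V >> 6, V & 3, (V >> 2) & 3, (V >> 4) & 3);
}

int main() {
  printMSBComment(0x41); // prints: msbs: dst=1 src0=1 src1=0 src2=0
  return 0;
}
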
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
@@ -584,6 +584,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
+  initializeAMDGPULowerVGPREncodingPass(*PR);
   initializeSIInsertHardClausesLegacyPass(*PR);
   initializeSIInsertWaitcntsLegacyPass(*PR);
   initializeSIModeRegisterLegacyPass(*PR);
@@ -1799,6 +1800,8 @@ void GCNPassConfig::addPreEmitPass() {
 
   addPass(&AMDGPUWaitSGPRHazardsLegacyID);
 
+  addPass(&AMDGPULowerVGPREncodingID);
+
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
     addPass(&AMDGPUInsertDelayAluID);

0 commit comments