
Commit f720d70

AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize
1 parent fb315fb commit f720d70
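
For orientation, here is a sketch of one pattern the extended combine handles, pieced together from the comments in the new getReadAnyLaneSrc and tryEliminateReadAnyLane below (the register names are the ones used in those comments, not taken from a test): a copy of re-merged readanylane results is folded back to the original vgpr value.

    LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
    LoSgpr = G_AMDGPU_READANYLANE LoVgpr
    HiSgpr = G_AMDGPU_READANYLANE HiVgpr
    Src = G_MERGE_VALUES LoSgpr, HiSgpr
    Dst = COPY Src
    ->
    uses of Dst rewritten to UnmergeSrc (or $Dst = COPY UnmergeSrc when Dst is a physical register)

The same rewrite is applied through an intervening G_BITCAST and for a single element unmerged back out of such a merge; replaceRegWithOrBuildCopy is what keeps a COPY for physical destinations instead of calling replaceRegWith.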

File tree

6 files changed: +234, -186 lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 201 additions & 104 deletions
@@ -23,6 +23,7 @@
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +116,222 @@ class AMDGPURegBankLegalizeCombiner {
         VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
         VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};

-  bool isLaneMask(Register Reg) {
-    const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
-    if (RB && RB->getID() == AMDGPU::VCCRegBankID)
-      return true;
+  bool isLaneMask(Register Reg);
+  std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+  std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+  Register getReadAnyLaneSrc(Register Src);
+  void replaceRegWithOrBuildCopy(Register Dst, Register Src);
+  bool tryEliminateReadAnyLane(MachineInstr &Copy);
+  void tryCombineCopy(MachineInstr &MI);
+  void tryCombineS1AnyExt(MachineInstr &MI);
+};

-    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
-    return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
-  }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+  const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+  if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+    return true;

-  void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
-    MI.eraseFromParent();
-    if (Optional0 && isTriviallyDead(*Optional0, MRI))
-      Optional0->eraseFromParent();
-  }
+  const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+  return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}

-  std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
-    MachineInstr *MatchMI = MRI.getVRegDef(Src);
-    if (MatchMI->getOpcode() != Opcode)
-      return {nullptr, Register()};
-    return {MatchMI, MatchMI->getOperand(1).getReg()};
-  }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+  MachineInstr *MatchMI = MRI.getVRegDef(Src);
+  if (MatchMI->getOpcode() != Opcode)
+    return {nullptr, Register()};
+  return {MatchMI, MatchMI->getOperand(1).getReg()};
+}

-  void tryCombineCopy(MachineInstr &MI) {
-    Register Dst = MI.getOperand(0).getReg();
-    Register Src = MI.getOperand(1).getReg();
-    // Skip copies of physical registers.
-    if (!Dst.isVirtual() || !Src.isVirtual())
-      return;
-
-    // This is a cross bank copy, sgpr S1 to lane mask.
-    //
-    // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
-    // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
-    // ->
-    // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
-    if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
-      auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
-      assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
-             "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
-      B.setInstr(MI);
-      // Ensure that truncated bits in BoolSrc are 0.
-      auto One = B.buildConstant({SgprRB, S32}, 1);
-      auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
-      B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+  MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+  if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+    return {nullptr, -1};
+
+  Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+  if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+    return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};

-    // Src = G_AMDGPU_READANYLANE RALSrc
-    // Dst = COPY Src
-    // ->
-    // Dst = RALSrc
-    if (MRI.getRegBankOrNull(Dst) == VgprRB &&
-        MRI.getRegBankOrNull(Src) == SgprRB) {
-      auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
-      if (!RAL)
-        return;
-
-      assert(MRI.getRegBank(RALSrc) == VgprRB);
-      MRI.replaceRegWith(Dst, RALSrc);
-      cleanUpAfterCombine(MI, RAL);
-      return;
+  return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+  // Src = G_AMDGPU_READANYLANE RALSrc
+  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+  if (RAL)
+    return RALSrc;
+
+  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+  // Src G_MERGE_VALUES LoSgpr, HiSgpr
+  auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+  if (Merge) {
+    unsigned NumElts = Merge->getNumSources();
+    auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+    if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+      return {};
+
+    // Check if all elements are from same unmerge and there is no shuffling.
+    for (unsigned i = 1; i < NumElts; ++i) {
+      auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+      if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+        return {};
     }
+    return Unmerge->getSourceReg();
   }

-  void tryCombineS1AnyExt(MachineInstr &MI) {
-    // %Src:sgpr(S1) = G_TRUNC %TruncSrc
-    // %Dst = G_ANYEXT %Src:sgpr(S1)
-    // ->
-    // %Dst = G_... %TruncSrc
-    Register Dst = MI.getOperand(0).getReg();
-    Register Src = MI.getOperand(1).getReg();
-    if (MRI.getType(Src) != S1)
-      return;
-
-    auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
-    if (!Trunc)
-      return;
-
-    LLT DstTy = MRI.getType(Dst);
-    LLT TruncSrcTy = MRI.getType(TruncSrc);
-
-    if (DstTy == TruncSrcTy) {
-      MRI.replaceRegWith(Dst, TruncSrc);
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+  // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
+  // SgprI = G_AMDGPU_READANYLANE VgprI
+  // SgprLarge G_MERGE_VALUES ..., SgprI, ...
+  // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
+  auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+  if (!UnMerge)
+    return {};
+
+  int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+  Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+  if (!Merge)
+    return {};
+
+  auto [RALElt, RALEltSrc] =
+      tryMatch(Merge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE);
+  if (RALElt)
+    return RALEltSrc;
+
+  return {};
+}
+
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+                                                              Register Src) {
+  if (Dst.isVirtual())
+    MRI.replaceRegWith(Dst, Src);
+  else
+    B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+    MachineInstr &Copy) {
+  Register Dst = Copy.getOperand(0).getReg();
+  Register Src = Copy.getOperand(1).getReg();
+  if (!Src.isVirtual())
+    return false;
+
+  Register RALDst = Src;
+  MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+  if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+    RALDst = SrcMI.getOperand(1).getReg();
+
+  Register RALSrc = getReadAnyLaneSrc(RALDst);
+  if (!RALSrc)
+    return false;
+
+  B.setInstr(Copy);
+  if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+    // Src = READANYLANE RALSrc     Src = READANYLANE RALSrc
+    // Dst = Copy Src               $Dst = Copy Src
+    // ->                           ->
+    // Dst = RALSrc                 $Dst = Copy RALSrc
+    replaceRegWithOrBuildCopy(Dst, RALSrc);
+  } else {
+    // RALDst = READANYLANE RALSrc  RALDst = READANYLANE RALSrc
+    // Src = G_BITCAST RALDst       Src = G_BITCAST RALDst
+    // Dst = Copy Src               Dst = Copy Src
+    // ->                           ->
+    // NewVgpr = G_BITCAST RALDst   NewVgpr = G_BITCAST RALDst
+    // Dst = NewVgpr                $Dst = Copy NewVgpr
+    auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+    replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+  }
+
+  eraseInstr(Copy, MRI, nullptr);
+  return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+  if (tryEliminateReadAnyLane(MI))
+    return;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  // Skip copies of physical registers.
+  if (!Dst.isVirtual() || !Src.isVirtual())
+    return;
+
+  // This is a cross bank copy, sgpr S1 to lane mask.
+  //
+  // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+  // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+  // ->
+  // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
+  if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+    auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+    assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+           "sgpr S1 must be result of G_TRUNC of sgpr S32");

     B.setInstr(MI);
+    // Ensure that truncated bits in BoolSrc are 0.
+    auto One = B.buildConstant({SgprRB, S32}, 1);
+    auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+    B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+    eraseInstr(MI, MRI, nullptr);
+  }
+}

-    if (DstTy == S32 && TruncSrcTy == S64) {
-      auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
-      MRI.replaceRegWith(Dst, Unmerge.getReg(0));
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+  // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+  // %Dst = G_ANYEXT %Src:sgpr(S1)
+  // ->
+  // %Dst = G_... %TruncSrc
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  if (MRI.getType(Src) != S1)
+    return;
+
+  auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+  if (!Trunc)
+    return;
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+  if (DstTy == TruncSrcTy) {
+    MRI.replaceRegWith(Dst, TruncSrc);
+    eraseInstr(MI, MRI, nullptr);
+    return;
+  }

-    if (DstTy == S64 && TruncSrcTy == S32) {
-      B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
-                            {TruncSrc, B.buildUndef({SgprRB, S32})});
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+  B.setInstr(MI);

-    if (DstTy == S32 && TruncSrcTy == S16) {
-      B.buildAnyExt(Dst, TruncSrc);
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+  if (DstTy == S32 && TruncSrcTy == S64) {
+    auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+    MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+    eraseInstr(MI, MRI, nullptr);
+    return;
+  }

-    if (DstTy == S16 && TruncSrcTy == S32) {
-      B.buildTrunc(Dst, TruncSrc);
-      cleanUpAfterCombine(MI, Trunc);
-      return;
-    }
+  if (DstTy == S64 && TruncSrcTy == S32) {
+    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+                          {TruncSrc, B.buildUndef({SgprRB, S32})});
+    eraseInstr(MI, MRI, nullptr);
+    return;
+  }

-    llvm_unreachable("missing anyext + trunc combine");
+  if (DstTy == S32 && TruncSrcTy == S16) {
+    B.buildAnyExt(Dst, TruncSrc);
+    eraseInstr(MI, MRI, nullptr);
+    return;
   }
-};
+
+  if (DstTy == S16 && TruncSrcTy == S32) {
+    B.buildTrunc(Dst, TruncSrc);
+    eraseInstr(MI, MRI, nullptr);
+    return;
+  }
+
+  llvm_unreachable("missing anyext + trunc combine");
+}

 // Search through MRI for virtual registers with sgpr register bank and S1 LLT.
 [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {

0 commit comments