Skip to content

Commit e01ddfa

Browse files
committed
FPInfo: AMDGPURegBankLegalize
1 parent 6e2ec24 commit e01ddfa

File tree

3 files changed

+44
-31
lines changed

3 files changed

+44
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
#include "AMDGPUGlobalISelUtils.h"
2222
#include "AMDGPURegBankLegalizeHelper.h"
2323
#include "GCNSubtarget.h"
24+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2425
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2526
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
2627
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -106,10 +107,10 @@ class AMDGPURegBankLegalizeCombiner {
106107
const RegisterBank *VgprRB;
107108
const RegisterBank *VccRB;
108109

109-
static constexpr LLT S1 = LLT::scalar(1);
110-
static constexpr LLT S16 = LLT::scalar(16);
111-
static constexpr LLT S32 = LLT::scalar(32);
112-
static constexpr LLT S64 = LLT::scalar(64);
110+
static constexpr LLT I1 = LLT::integer(1);
111+
static constexpr LLT I16 = LLT::integer(16);
112+
static constexpr LLT I32 = LLT::integer(32);
113+
static constexpr LLT I64 = LLT::integer(64);
113114

114115
public:
115116
AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI,
@@ -156,13 +157,13 @@ class AMDGPURegBankLegalizeCombiner {
156157
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
157158
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
158159
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
159-
assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
160+
assert(Trunc && MRI.getType(TruncS32Src) == I32 &&
160161
"sgpr S1 must be result of G_TRUNC of sgpr S32");
161162

162163
B.setInstr(MI);
163164
// Ensure that truncated bits in BoolSrc are 0.
164-
auto One = B.buildConstant({SgprRB, S32}, 1);
165-
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
165+
auto One = B.buildConstant({SgprRB, I32}, 1);
166+
auto BoolSrc = B.buildAnd({SgprRB, I32}, TruncS32Src, One);
166167
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
167168
cleanUpAfterCombine(MI, Trunc);
168169
return;
@@ -192,7 +193,7 @@ class AMDGPURegBankLegalizeCombiner {
192193
// %Dst = G_... %TruncSrc
193194
Register Dst = MI.getOperand(0).getReg();
194195
Register Src = MI.getOperand(1).getReg();
195-
if (MRI.getType(Src) != S1)
196+
if (MRI.getType(Src) != I1)
196197
return;
197198

198199
auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
@@ -210,20 +211,20 @@ class AMDGPURegBankLegalizeCombiner {
210211

211212
B.setInstr(MI);
212213

213-
if (DstTy == S32 && TruncSrcTy == S64) {
214-
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
214+
if (DstTy == I32 && TruncSrcTy == I64) {
215+
auto Unmerge = B.buildUnmerge({SgprRB, I32}, TruncSrc);
215216
MRI.replaceRegWith(Dst, Unmerge.getReg(0));
216217
cleanUpAfterCombine(MI, Trunc);
217218
return;
218219
}
219220

220-
if (DstTy == S32 && TruncSrcTy == S16) {
221+
if (DstTy == I32 && TruncSrcTy == I16) {
221222
B.buildAnyExt(Dst, TruncSrc);
222223
cleanUpAfterCombine(MI, Trunc);
223224
return;
224225
}
225226

226-
if (DstTy == S16 && TruncSrcTy == S32) {
227+
if (DstTy == I16 && TruncSrcTy == I32) {
227228
B.buildTrunc(Dst, TruncSrc);
228229
cleanUpAfterCombine(MI, Trunc);
229230
return;
@@ -305,7 +306,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
305306
// Opcodes that support pretty much all combinations of reg banks and LLTs
306307
// (except S1). There is no point in writing rules for them.
307308
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
308-
Opc == AMDGPU::G_MERGE_VALUES) {
309+
Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
309310
RBLHelper.applyMappingTrivial(*MI);
310311
continue;
311312
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
215215
LLT EltTy = DstTy.getElementType();
216216
B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
217217
} else {
218-
B128 = LLT::scalar(128);
218+
B128 = LLT::integer(128);
219219
}
220220
if (Size / 128 == 2)
221221
splitLoad(MI, {B128, B128});

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 29 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1969,7 +1969,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19691969
IsDivergentIdx, &Subtarget))
19701970
return false;
19711971

1972-
LLT S32 = LLT::scalar(32);
1972+
LLT I32 = LLT::integer(32);
19731973

19741974
const RegisterBank &DstBank =
19751975
*OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -1981,10 +1981,10 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19811981
SrcBank == AMDGPU::SGPRRegBank &&
19821982
IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
19831983
: AMDGPU::VCCRegBank;
1984-
LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1984+
LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? I32 : LLT::integer(1);
19851985

19861986
if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1987-
Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1987+
Idx = B.buildCopy(I32, Idx)->getOperand(0).getReg();
19881988
MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
19891989
}
19901990

@@ -1996,13 +1996,19 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19961996
else
19971997
EltTy = MRI.getType(DstRegs[0]);
19981998

1999+
if (VecTy.isFloatVector()) {
2000+
auto ClassOrBank = MRI.getRegClassOrRegBank(VecReg);
2001+
VecReg = B.buildBitcast({ClassOrBank, VecTy.changeToInteger()}, VecReg).getReg(0);
2002+
}
2003+
19992004
auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
20002005
SmallVector<Register, 2> Res(NumLanes);
2001-
for (unsigned L = 0; L < NumLanes; ++L)
2006+
for (unsigned L = 0; L < NumLanes; ++L) {
20022007
Res[L] = UnmergeToEltTy.getReg(L);
2008+
}
20032009

20042010
for (unsigned I = 1; I < NumElem; ++I) {
2005-
auto IC = B.buildConstant(S32, I);
2011+
auto IC = B.buildConstant(I32, I);
20062012
MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
20072013
auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
20082014
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
@@ -2067,7 +2073,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20672073
IsDivergentIdx, &Subtarget))
20682074
return false;
20692075

2070-
LLT S32 = LLT::scalar(32);
2076+
LLT I32 = LLT::integer(32);
20712077

20722078
const RegisterBank &DstBank =
20732079
*OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -2082,10 +2088,10 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20822088
InsBank == AMDGPU::SGPRRegBank &&
20832089
IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
20842090
: AMDGPU::VCCRegBank;
2085-
LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2091+
LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? I32 : LLT::integer(1);
20862092

20872093
if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2088-
Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2094+
Idx = B.buildCopy(I32, Idx)->getOperand(0).getReg();
20892095
MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
20902096
}
20912097

@@ -2099,11 +2105,17 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20992105
EltTy = MRI.getType(InsRegs[0]);
21002106
}
21012107

2108+
if (VecTy.getScalarType().isFloat() && !EltTy.isFloat()) {
2109+
auto RegBankOrClass = MRI.getRegClassOrRegBank(VecReg);
2110+
auto CastTy = VecTy.changeToInteger();
2111+
VecReg = B.buildBitcast({RegBankOrClass, CastTy}, VecReg).getReg(0);
2112+
}
2113+
21022114
auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
21032115
SmallVector<Register, 16> Ops(NumElem * NumLanes);
21042116

21052117
for (unsigned I = 0; I < NumElem; ++I) {
2106-
auto IC = B.buildConstant(S32, I);
2118+
auto IC = B.buildConstant(I32, I);
21072119
MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
21082120
auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
21092121
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
@@ -2156,7 +2168,7 @@ void AMDGPURegisterBankInfo::applyMappingSMULU64(
21562168
MachineRegisterInfo &MRI = OpdMapper.getMRI();
21572169
MachineInstr &MI = OpdMapper.getMI();
21582170
Register DstReg = MI.getOperand(0).getReg();
2159-
LLT HalfTy = LLT::scalar(32);
2171+
LLT HalfTy = LLT::integer(32);
21602172

21612173
// Depending on where the source registers came from, the generic code may
21622174
// have decided to split the inputs already or not. If not, we still need to
@@ -2828,7 +2840,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
28282840
Register DstReg = MI.getOperand(0).getReg();
28292841
Register SrcReg = MI.getOperand(1).getReg();
28302842

2831-
const LLT S32 = LLT::scalar(32);
2843+
const LLT I32 = LLT::integer(32);
28322844
LLT DstTy = MRI.getType(DstReg);
28332845
LLT SrcTy = MRI.getType(SrcReg);
28342846

@@ -2891,10 +2903,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
28912903

28922904
assert(DstTy.getSizeInBits() == 64);
28932905

2894-
LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2906+
LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), I32);
28952907

28962908
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2897-
auto One = B.buildConstant(S32, 1);
2909+
auto One = B.buildConstant(I32, 1);
28982910

28992911
MachineBasicBlock::iterator MII = MI.getIterator();
29002912

@@ -2905,8 +2917,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29052917
MachineInstrSpan Span(MII, &B.getMBB());
29062918

29072919
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2908-
auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2909-
auto IdxHi = B.buildAdd(S32, IdxLo, One);
2920+
auto IdxLo = B.buildShl(I32, BaseIdxReg, One);
2921+
auto IdxHi = B.buildAdd(I32, IdxLo, One);
29102922

29112923
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
29122924
auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
@@ -2932,8 +2944,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29322944

29332945
if (NeedCopyToVGPR) {
29342946
MachineBasicBlock *LoopBB = Extract1->getParent();
2935-
Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2936-
Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2947+
Register TmpReg0 = MRI.createGenericVirtualRegister(I32);
2948+
Register TmpReg1 = MRI.createGenericVirtualRegister(I32);
29372949
MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
29382950
MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
29392951

0 commit comments

Comments
 (0)