Skip to content

Commit 83532f0

Browse files
committed
Added VGPR_16 to the GISel register bank; support G_BUILD_VECTOR selection in GISel for true16
1 parent c54616e commit 83532f0

File tree

9 files changed

+861
-365
lines changed

9 files changed

+861
-365
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,9 +700,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
700700
return true;
701701

702702
// TODO: This should probably be a combine somewhere
703-
// (build_vector $src0, undef) -> copy $src0
704703
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
705704
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
705+
if (Subtarget->useRealTrue16Insts() && IsVector) {
706+
// (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
707+
// -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
708+
Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
709+
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
710+
BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
711+
.addReg(Undef)
712+
.addReg(Src0)
713+
.addImm(AMDGPU::lo16);
714+
MI.eraseFromParent();
715+
return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
716+
RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
717+
}
718+
// (build_vector $src0, undef) -> copy $src0
706719
MI.setDesc(TII.get(AMDGPU::COPY));
707720
MI.removeOperand(2);
708721
const auto &RC =

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,9 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
223223
};
224224
}
225225

226-
static bool isRegisterSize(unsigned Size) {
227-
return Size % 32 == 0 && Size <= MaxRegisterSize;
226+
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
227+
return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
228+
Size <= MaxRegisterSize;
228229
}
229230

230231
static bool isRegisterVectorElementType(LLT EltTy) {
@@ -240,8 +241,8 @@ static bool isRegisterVectorType(LLT Ty) {
240241
}
241242

242243
// TODO: replace all uses of isRegisterType with isRegisterClassType
243-
static bool isRegisterType(LLT Ty) {
244-
if (!isRegisterSize(Ty.getSizeInBits()))
244+
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
245+
if (!isRegisterSize(ST, Ty.getSizeInBits()))
245246
return false;
246247

247248
if (Ty.isVector())
@@ -252,19 +253,19 @@ static bool isRegisterType(LLT Ty) {
252253

253254
// Any combination of 32 or 64-bit elements up the maximum register size, and
254255
// multiples of v2s16.
255-
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256-
return [=](const LegalityQuery &Query) {
257-
return isRegisterType(Query.Types[TypeIdx]);
256+
static LegalityPredicate isRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
257+
return [=, &ST](const LegalityQuery &Query) {
258+
return isRegisterType(ST, Query.Types[TypeIdx]);
258259
};
259260
}
260261

261262
// RegisterType that doesn't have a corresponding RegClass.
262263
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263264
// should be removed.
264-
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265-
return [=](const LegalityQuery &Query) {
265+
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
266+
return [=, &ST](const LegalityQuery &Query) {
266267
LLT Ty = Query.Types[TypeIdx];
267-
return isRegisterType(Ty) &&
268+
return isRegisterType(ST, Ty) &&
268269
!SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269270
};
270271
}
@@ -348,17 +349,19 @@ static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
348349
V6S64, V7S64, V8S64, V16S64};
349350

350351
// Checks whether a type is in the list of legal register types.
351-
static bool isRegisterClassType(LLT Ty) {
352+
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
352353
if (Ty.isPointerOrPointerVector())
353354
Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
354355

355356
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
356-
is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
357+
is_contained(AllScalarTypes, Ty) ||
358+
(ST.useRealTrue16Insts() && Ty == S16) ||
359+
is_contained(AllS16Vectors, Ty);
357360
}
358361

359-
static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
360-
return [TypeIdx](const LegalityQuery &Query) {
361-
return isRegisterClassType(Query.Types[TypeIdx]);
362+
static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST, unsigned TypeIdx) {
363+
return [&ST, TypeIdx](const LegalityQuery &Query) {
364+
return isRegisterClassType(ST, Query.Types[TypeIdx]);
362365
};
363366
}
364367

@@ -510,7 +513,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
510513

511514
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
512515
const LLT Ty = Query.Types[0];
513-
return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
516+
return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
514517
!hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
515518
}
516519

@@ -523,12 +526,12 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
523526
if (Size != MemSizeInBits)
524527
return Size <= 32 && Ty.isVector();
525528

526-
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
529+
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
527530
return true;
528531

529532
// Don't try to handle bitcasting vector ext loads for now.
530533
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
531-
(Size <= 32 || isRegisterSize(Size)) &&
534+
(Size <= 32 || isRegisterSize(ST, Size)) &&
532535
!isRegisterVectorElementType(Ty.getElementType());
533536
}
534537

@@ -875,7 +878,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
875878

876879
getActionDefinitionsBuilder(G_BITCAST)
877880
// Don't worry about the size constraint.
878-
.legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
881+
.legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
879882
.lower();
880883

881884
getActionDefinitionsBuilder(G_CONSTANT)
@@ -890,7 +893,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
890893
.clampScalar(0, S16, S64);
891894

892895
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
893-
.legalIf(isRegisterClassType(0))
896+
.legalIf(isRegisterClassType(ST, 0))
894897
// s1 and s16 are special cases because they have legal operations on
895898
// them, but don't really occupy registers in the normal way.
896899
.legalFor({S1, S16})
@@ -1825,7 +1828,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
18251828
.clampMaxNumElements(VecTypeIdx, S32, 32)
18261829
// TODO: Clamp elements for 64-bit vectors?
18271830
.moreElementsIf(
1828-
isIllegalRegisterType(VecTypeIdx),
1831+
isIllegalRegisterType(ST, VecTypeIdx),
18291832
moreElementsToNextExistingRegClass(VecTypeIdx))
18301833
// It should only be necessary with variable indexes.
18311834
// As a last resort, lower to the stack
@@ -1883,7 +1886,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
18831886
.clampNumElements(0, V2S64, V16S64)
18841887
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
18851888
.moreElementsIf(
1886-
isIllegalRegisterType(0),
1889+
isIllegalRegisterType(ST, 0),
18871890
moreElementsToNextExistingRegClass(0));
18881891

18891892
if (ST.hasScalarPackInsts()) {
@@ -1904,11 +1907,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
19041907
.lower();
19051908
}
19061909

1907-
BuildVector.legalIf(isRegisterType(0));
1910+
BuildVector.legalIf(isRegisterType(ST, 0));
19081911

19091912
// FIXME: Clamp maximum size
19101913
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1911-
.legalIf(all(isRegisterType(0), isRegisterType(1)))
1914+
.legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
19121915
.clampMaxNumElements(0, S32, 32)
19131916
.clampMaxNumElements(1, S16, 2) // TODO: Make 4?
19141917
.clampMaxNumElements(0, S16, 64);
@@ -1933,7 +1936,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
19331936
};
19341937

19351938
auto &Builder = getActionDefinitionsBuilder(Op)
1936-
.legalIf(all(isRegisterType(0), isRegisterType(1)))
1939+
.legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
19371940
.lowerFor({{S16, V2S16}})
19381941
.lowerIf([=](const LegalityQuery &Query) {
19391942
const LLT BigTy = Query.Types[BigTyIdx];
@@ -3149,7 +3152,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
31493152
} else {
31503153
// Extract the subvector.
31513154

3152-
if (isRegisterType(ValTy)) {
3155+
if (isRegisterType(ST, ValTy)) {
31533156
// If this a case where G_EXTRACT is legal, use it.
31543157
// (e.g. <3 x s32> -> <4 x s32>)
31553158
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);

llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
1111
>;
1212

1313
def VGPRRegBank : RegisterBank<"VGPR",
14-
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
14+
[VGPR_16_Lo128, VGPR_16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
1515
>;
1616

1717
// It is helpful to distinguish conditions from ordinary SGPRs.

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3275,6 +3275,8 @@ def : GCNPat <
32753275
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
32763276
>;
32773277

3278+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3279+
let True16Predicate = p in {
32783280
def : GCNPat <
32793281
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
32803282
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3284,6 +3286,7 @@ def : GCNPat <
32843286
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
32853287
(S_LSHL_B32 SReg_32:$src1, (i32 16))
32863288
>;
3289+
}
32873290

32883291
def : GCNPat <
32893292
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3293,6 +3296,8 @@ def : GCNPat <
32933296
}
32943297

32953298
let SubtargetPredicate = HasVOP3PInsts in {
3299+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3300+
let True16Predicate = p in
32963301
def : GCNPat <
32973302
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
32983303
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3322,12 +3327,25 @@ def : GCNPat <
33223327
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
33233328
>;
33243329

3330+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3331+
let True16Predicate = p in
33253332
// Take the lower 16 bits from each VGPR_32 and concat them
33263333
def : GCNPat <
33273334
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
33283335
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
33293336
>;
33303337

3338+
let True16Predicate = UseRealTrue16Insts in {
3339+
def : GCNPat <
3340+
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
3341+
(REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
3342+
>;
3343+
// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
3344+
def : GCNPat <
3345+
(vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
3346+
(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
3347+
>;
3348+
}
33313349

33323350
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
33333351
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3353,6 +3371,8 @@ def : GCNPat <
33533371

33543372
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
33553373
// Special case, can use V_ALIGNBIT (always uses encoded literal)
3374+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3375+
let True16Predicate = p in
33563376
def : GCNPat <
33573377
(vecTy (DivergentBinFrag<build_vector>
33583378
(Ty !if(!eq(Ty, i16),

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
3535
cl::ReallyHidden,
3636
cl::init(true));
3737

38-
std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
38+
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
3939
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
4040

4141
// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
@@ -343,9 +343,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
343343
static auto InitializeRegSplitPartsOnce = [this]() {
344344
for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
345345
unsigned Size = getSubRegIdxSize(Idx);
346-
if (Size & 31)
346+
if (Size & 15)
347347
continue;
348-
std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
348+
std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
349349
unsigned Pos = getSubRegIdxOffset(Idx);
350350
if (Pos % Size)
351351
continue;
@@ -3561,14 +3561,14 @@ bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
35613561
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
35623562
unsigned EltSize) const {
35633563
const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3564-
assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
3564+
assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
35653565

3566-
const unsigned RegDWORDs = RegBitWidth / 32;
3567-
const unsigned EltDWORDs = EltSize / 4;
3568-
assert(RegSplitParts.size() + 1 >= EltDWORDs);
3566+
const unsigned RegHalves = RegBitWidth / 16;
3567+
const unsigned EltHalves = EltSize / 2;
3568+
assert(RegSplitParts.size() + 1 >= EltHalves);
35693569

3570-
const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
3571-
const unsigned NumParts = RegDWORDs / EltDWORDs;
3570+
const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3571+
const unsigned NumParts = RegHalves / EltHalves;
35723572

35733573
return ArrayRef(Parts.data(), NumParts);
35743574
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3737
BitVector RegPressureIgnoredUnits;
3838

3939
/// Sub reg indexes for getRegSplitParts.
40-
/// First index represents subreg size from 1 to 16 DWORDs.
40+
/// First index represents subreg size from 1 to 32 Half DWORDS.
4141
/// The inner vector is sorted by bit offset.
4242
/// Provided a register can be fully split with given subregs,
4343
/// all elements of the inner vector combined give a full lane mask.
44-
static std::array<std::vector<int16_t>, 16> RegSplitParts;
44+
static std::array<std::vector<int16_t>, 32> RegSplitParts;
4545

4646
// Table representing sub reg of given width and offset.
4747
// First index is subreg size: 32, 64, 96, 128, 160, 192, 224, 256, 512.

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2427,6 +2427,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
24272427
// (move from MC* level to Target* level). Return size in bits.
24282428
unsigned getRegBitWidth(unsigned RCID) {
24292429
switch (RCID) {
2430+
case AMDGPU::VGPR_16RegClassID:
2431+
case AMDGPU::VGPR_16_Lo128RegClassID:
24302432
case AMDGPU::SGPR_LO16RegClassID:
24312433
case AMDGPU::AGPR_LO16RegClassID:
24322434
return 16;

0 commit comments

Comments (0)