Skip to content

Commit 6aebbb0

Browse files
authored
[AMDGPU] Define 1024 VGPRs on gfx1250 (llvm#156765)
This is baseline support; it is not usable yet.
1 parent a34b110 commit 6aebbb0

33 files changed

+671
-505
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,14 @@ class AMDGPUOperand : public MCParsedAsmOperand {
564564
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
565565
}
566566

567+
// Operand predicate: forwards to isRegOrInlineNoMods() with the
// VS_32_Lo256 register class ID and the i32 value type.
bool isVCSrc_b32_Lo256() const {
568+
return isRegOrInlineNoMods(AMDGPU::VS_32_Lo256RegClassID, MVT::i32);
569+
}
570+
571+
// Operand predicate: forwards to isRegOrInlineNoMods() with the
// VS_64_Lo256 register class ID and the i64 value type.
bool isVCSrc_b64_Lo256() const {
572+
return isRegOrInlineNoMods(AMDGPU::VS_64_Lo256RegClassID, MVT::i64);
573+
}
574+
567575
// Operand predicate: forwards to isRegOrInlineNoMods() with the
// VS_64 register class ID and the i64 value type.
bool isVCSrc_b64() const {
568576
return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
569577
}
@@ -2986,7 +2994,12 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
29862994

29872995
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
29882996
const MCRegisterClass RC = TRI->getRegClass(RCID);
2989-
if (RegIdx >= RC.getNumRegs()) {
2997+
if (RegIdx >= RC.getNumRegs() || (RegKind == IS_VGPR && RegIdx > 255)) {
2998+
Error(Loc, "register index is out of range");
2999+
return AMDGPU::NoRegister;
3000+
}
3001+
3002+
if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) {
29903003
Error(Loc, "register index is out of range");
29913004
return MCRegister();
29923005
}

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,26 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
12231223
}
12241224
}
12251225

1226+
// Given a wide tuple \p Reg check if it will overflow 256 registers.
1227+
// \returns \p Reg on success or NoRegister otherwise.
// \p RC supplies the tuple width in dwords (getSizeInBits() / 32); \p MRI is
// used to look up sub0 of \p Reg and the VGPR_32 / AGPR_32 register classes.
// A \p Reg that has no sub0 sub-register is returned unchanged.
1228+
static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
1229+
const MCRegisterInfo &MRI) {
1230+
unsigned NumRegs = RC.getSizeInBits() / 32;
1231+
MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0);
1232+
if (!Sub0)
1233+
return Reg;
1234+
1235+
MCRegister BaseReg;
1236+
if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(Sub0))
1237+
BaseReg = AMDGPU::VGPR0;
1238+
else if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Sub0))
1239+
BaseReg = AMDGPU::AGPR0;
1240+
1241+
assert(BaseReg && "Only vector registers expected");
1242+
1243+
// Sub0 - BaseReg is used as the index of sub0 within its register file
// (NOTE(review): assumes consecutive register numbering from VGPR0/AGPR0 —
// confirm). The tuple fits when index + width does not go past 256.
return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister;
1244+
}
1245+
12261246
// Note that before gfx10, the MIMG encoding provided no information about
12271247
// VADDR size. Consequently, decoded instructions always show address as if it
12281248
// has 1 dword, which may not actually be the case.
@@ -1327,8 +1347,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
13271347
MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
13281348
Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
13291349

1330-
NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
1331-
&MRI.getRegClass(DataRCID));
1350+
const MCRegisterClass &NewRC = MRI.getRegClass(DataRCID);
1351+
NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &NewRC);
1352+
NewVdata = CheckVGPROverflow(NewVdata, NewRC, MRI);
13321353
if (!NewVdata) {
13331354
// It's possible to encode this such that the low register + enabled
13341355
// components exceeds the register count.
@@ -1347,8 +1368,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
13471368
VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
13481369

13491370
auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
1350-
NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
1351-
&MRI.getRegClass(AddrRCID));
1371+
const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID);
1372+
NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC);
1373+
NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI);
13521374
if (!NewVAddrSA)
13531375
return;
13541376
}

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
577577

578578
unsigned MaxNumVGPRs = MaxVectorRegs;
579579
unsigned MaxNumAGPRs = 0;
580+
unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256;
580581

581582
// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
582583
// a wave may have up to 512 total vector registers combining together both
@@ -589,7 +590,6 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
589590
if (hasGFX90AInsts()) {
590591
unsigned MinNumAGPRs = 0;
591592
const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
592-
const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
593593

594594
const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
595595

@@ -614,11 +614,11 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
614614
MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
615615
MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
616616

617-
MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
617+
MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
618618
MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
619619

620620
assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
621-
MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
621+
MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
622622
"invalid register counts");
623623
} else if (hasMAIInsts()) {
624624
// On gfx908 the number of AGPRs always equals the number of VGPRs.

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
402402
if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
403403
assert((Encoding & 0xFF) == 0);
404404
Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
405-
AMDGPU::HWEncoding::REG_IDX_MASK;
405+
AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
406406
}
407407

408408
for (unsigned i = 0; i < bytes; i++) {
@@ -551,7 +551,7 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding(
551551
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
552552
MCRegister Reg = MI.getOperand(OpNo).getReg();
553553
unsigned Enc = MRI.getEncodingValue(Reg);
554-
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
554+
unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
555555
bool IsVGPROrAGPR =
556556
Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
557557

@@ -593,7 +593,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
593593
const MCSubtargetInfo &STI) const {
594594
if (MO.isReg()){
595595
unsigned Enc = MRI.getEncodingValue(MO.getReg());
596-
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
596+
unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
597597
bool IsVGPROrAGPR =
598598
Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
599599
Op = Idx | (IsVGPROrAGPR << 8);
@@ -656,7 +656,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
656656
const MCOperand &MO = MI.getOperand(OpNo);
657657
if (MO.isReg()) {
658658
uint16_t Encoding = MRI.getEncodingValue(MO.getReg());
659-
unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK;
659+
unsigned RegIdx = Encoding & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
660660
bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI16;
661661
bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR;
662662
assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!");

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -354,10 +354,11 @@ enum : unsigned {
354354
// Register codes as defined in the TableGen's HWEncoding field.
355355
namespace HWEncoding {
356356
enum : unsigned {
357-
REG_IDX_MASK = 0xff,
358-
IS_VGPR = 1 << 8,
359-
IS_AGPR = 1 << 9,
360-
IS_HI16 = 1 << 10,
357+
REG_IDX_MASK = 0x3ff,
358+
LO256_REG_IDX_MASK = 0xff,
359+
IS_VGPR = 1 << 10,
360+
IS_AGPR = 1 << 11,
361+
IS_HI16 = 1 << 12,
361362
};
362363
} // namespace HWEncoding
363364

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1728,7 +1728,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
17281728
"Whole wave functions can use the reg mapped for their i1 argument");
17291729

17301730
// FIXME: Be more efficient!
1731-
for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
1731+
unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
1732+
for (MCRegister Reg :
1733+
AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs))
17321734
if (MF.getRegInfo().isPhysRegModified(Reg)) {
17331735
MFI->reserveWWMRegister(Reg);
17341736
MF.begin()->addLiveIn(Reg);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16916,7 +16916,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
1691616916
switch (BitWidth) {
1691716917
case 16:
1691816918
RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16919-
: &AMDGPU::VGPR_32RegClass;
16919+
: &AMDGPU::VGPR_32_Lo256RegClass;
1692016920
break;
1692116921
default:
1692216922
RC = TRI->getVGPRClassForBitWidth(BitWidth);
@@ -16963,7 +16963,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
1696316963
auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
1696416964
if (Kind != '\0') {
1696516965
if (Kind == 'v') {
16966-
RC = &AMDGPU::VGPR_32RegClass;
16966+
RC = &AMDGPU::VGPR_32_Lo256RegClass;
1696716967
} else if (Kind == 's') {
1696816968
RC = &AMDGPU::SGPR_32RegClass;
1696916969
} else if (Kind == 'a') {
@@ -17005,6 +17005,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
1700517005
return std::pair(0U, nullptr);
1700617006
if (Idx < RC->getNumRegs())
1700717007
return std::pair(RC->getRegister(Idx), RC);
17008+
return std::pair(0U, nullptr);
1700817009
}
1700917010
}
1701017011

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
152152
// We reserve a fixed number of VGPR slots in the scoring tables for
153153
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
154154
enum RegisterMapping {
155-
SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
155+
SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
156156
AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
157157
SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
158158
// Artificial register slots to track LDS writes into specific LDS locations
@@ -831,15 +831,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
831831

832832
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
833833
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
834-
assert(isUInt<8>(RegIdx));
835834

836835
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
837836
unsigned Size = TRI->getRegSizeInBits(*RC);
838837

839838
// AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
840839
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
841840
unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
842-
assert(Reg < AGPR_OFFSET);
841+
assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
843842
Result.first = Reg;
844843
if (TRI->isAGPR(*MRI, Op.getReg()))
845844
Result.first += AGPR_OFFSET;

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3273,6 +3273,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
32733273
return AMDGPUInstPrinter::getRegisterName(Reg);
32743274
}
32753275

3276+
// Returns the encoding value of \p Reg masked with
// AMDGPU::HWEncoding::REG_IDX_MASK, i.e. only the register-index bits of the
// encoding.
unsigned SIRegisterInfo::getHWRegIndex(MCRegister Reg) const {
3277+
return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3278+
}
3279+
32763280
// Convenience overload: forwards to getRegBitWidth() using the ID of \p RC.
unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
32773281
return getRegBitWidth(RC.getID());
32783282
}
@@ -3353,6 +3357,40 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
33533357
: getAnyVGPRClassForBitWidth(BitWidth);
33543358
}
33553359

3360+
// Returns the Lo256 VGPR register class for the smallest listed width that is
// >= \p BitWidth: VGPR_32_Lo256 for up to 32 bits, otherwise the matching
// VReg_<N>_Lo256_Align2 class. \returns nullptr when \p BitWidth is larger
// than 1024.
const TargetRegisterClass *
3361+
SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
3362+
if (BitWidth <= 32)
3363+
return &AMDGPU::VGPR_32_Lo256RegClass;
3364+
if (BitWidth <= 64)
3365+
return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3366+
if (BitWidth <= 96)
3367+
return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3368+
if (BitWidth <= 128)
3369+
return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3370+
if (BitWidth <= 160)
3371+
return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3372+
if (BitWidth <= 192)
3373+
return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3374+
if (BitWidth <= 224)
3375+
return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3376+
if (BitWidth <= 256)
3377+
return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3378+
if (BitWidth <= 288)
3379+
return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3380+
if (BitWidth <= 320)
3381+
return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3382+
if (BitWidth <= 352)
3383+
return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3384+
if (BitWidth <= 384)
3385+
return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3386+
// Above 384 bits only the 512- and 1024-bit classes are listed, so widths in
// the gaps round up to the next of those two.
if (BitWidth <= 512)
3387+
return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3388+
if (BitWidth <= 1024)
3389+
return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3390+
3391+
return nullptr;
3392+
}
3393+
33563394
static const TargetRegisterClass *
33573395
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
33583396
if (BitWidth == 64)
@@ -3547,7 +3585,17 @@ bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
35473585
const TargetRegisterClass *
35483586
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
35493587
unsigned Size = getRegSizeInBits(*SRC);
3550-
const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3588+
3589+
switch (SRC->getID()) {
3590+
default:
3591+
break;
3592+
case AMDGPU::VS_32_Lo256RegClassID:
3593+
case AMDGPU::VS_64_Lo256RegClassID:
3594+
return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3595+
}
3596+
3597+
const TargetRegisterClass *VRC =
3598+
getAllocatableClass(getVGPRClassForBitWidth(Size));
35513599
assert(VRC && "Invalid register class size");
35523600
return VRC;
35533601
}
@@ -4005,7 +4053,12 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
40054053
unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
40064054
const TargetRegisterClass &RC,
40074055
bool IncludeCalls) const {
4008-
for (MCPhysReg Reg : reverse(RC.getRegisters()))
4056+
unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
4057+
ArrayRef<MCPhysReg> Registers =
4058+
(RC.getID() == AMDGPU::VGPR_32RegClassID)
4059+
? RC.getRegisters().take_front(NumArchVGPRs)
4060+
: RC.getRegisters();
4061+
for (MCPhysReg Reg : reverse(Registers))
40094062
if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
40104063
return getHWRegIndex(Reg) + 1;
40114064
return 0;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,13 +200,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
200200
StringRef getRegAsmName(MCRegister Reg) const override;
201201

202202
// Pseudo regs are not allowed
203-
unsigned getHWRegIndex(MCRegister Reg) const {
204-
return getEncodingValue(Reg) & 0xff;
205-
}
203+
unsigned getHWRegIndex(MCRegister Reg) const;
206204

207205
LLVM_READONLY
208206
const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
209207

208+
LLVM_READONLY const TargetRegisterClass *
209+
getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const;
210+
210211
LLVM_READONLY
211212
const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
212213

0 commit comments

Comments
 (0)