Skip to content

Commit dddeb07

Browse files
rampitec and shiltian authored
[AMDGPU] Restrict packed math FP32 instructions to read only one SGPR per operand on gfx12+ (#152465)
Sec. 4.6.7.1 of the gfx1250 SPG states that if an SGPR is used as an operand, only one SGPR will be read for both the low and high operations. As a result, the corresponding bits in `op_sel` and `op_sel_hi` must be the same when the operand is an SGPR. Co-authored-by: Tian, Shilei <[email protected]> Co-authored-by: Tian, Shilei <[email protected]>
1 parent cb2d56c commit dddeb07

File tree

7 files changed

+753
-371
lines changed

7 files changed

+753
-371
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
49334933
return false;
49344934
}
49354935

4936+
// Packed math FP32 instructions typically accept SGPRs or VGPRs as source
4937+
// operands. On gfx12+, if a source operand uses SGPRs, the HW can only read
4938+
// the first SGPR and use it for both the low and high operations.
4939+
if (isPackedFP32Inst(Opc) && isGFX12Plus()) {
4940+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4941+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4942+
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
4943+
int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
4944+
4945+
const MCOperand &Src0 = Inst.getOperand(Src0Idx);
4946+
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
4947+
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
4948+
unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
4949+
4950+
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
4951+
4952+
auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool {
4953+
unsigned Mask = 1U << Index;
4954+
return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0);
4955+
};
4956+
4957+
if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) &&
4958+
!VerifyOneSGPR(/*Index=*/0))
4959+
return false;
4960+
if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) &&
4961+
!VerifyOneSGPR(/*Index=*/1))
4962+
return false;
4963+
4964+
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
4965+
if (Src2Idx != -1) {
4966+
const MCOperand &Src2 = Inst.getOperand(Src2Idx);
4967+
if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) &&
4968+
!VerifyOneSGPR(/*Index=*/2))
4969+
return false;
4970+
}
4971+
}
4972+
49364973
return true;
49374974
}
49384975

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "GCNSubtarget.h"
1919
#include "SIMachineFunctionInfo.h"
2020
#include "Utils/AMDGPUBaseInfo.h"
21+
#include "llvm/ADT/STLExtras.h"
2122
#include "llvm/Analysis/ValueTracking.h"
2223
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
2324
#include "llvm/CodeGen/LiveIntervals.h"
@@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
55345535
}
55355536
}
55365537

5538+
// See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5539+
// information.
5540+
if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5541+
for (unsigned I = 0; I < 3; ++I) {
5542+
if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5543+
return false;
5544+
}
5545+
}
5546+
55375547
return true;
55385548
}
55395549

@@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
60056015
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
60066016
unsigned Opc = MI.getOpcode();
60076017

6018+
// See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6019+
// information.
6020+
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6021+
MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6022+
constexpr const AMDGPU::OpName OpNames[] = {
6023+
AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6024+
6025+
for (auto [I, OpName] : enumerate(OpNames)) {
6026+
int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6027+
if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6028+
!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6029+
return false;
6030+
}
6031+
}
6032+
60086033
if (!isLegalRegOperand(MRI, OpInfo, MO))
60096034
return false;
60106035

@@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
60536078
return true;
60546079
}
60556080

6081+
bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6082+
const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6083+
const MachineOperand *MO) const {
6084+
constexpr const unsigned NumOps = 3;
6085+
constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6086+
AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6087+
AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6088+
AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6089+
6090+
assert(SrcN < NumOps);
6091+
6092+
if (!MO) {
6093+
int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6094+
if (SrcIdx == -1)
6095+
return true;
6096+
MO = &MI.getOperand(SrcIdx);
6097+
}
6098+
6099+
if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6100+
return true;
6101+
6102+
int ModsIdx =
6103+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6104+
if (ModsIdx == -1)
6105+
return true;
6106+
6107+
unsigned Mods = MI.getOperand(ModsIdx).getImm();
6108+
bool OpSel = Mods & SISrcMods::OP_SEL_0;
6109+
bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6110+
6111+
return !OpSel && !OpSelHi;
6112+
}
6113+
60566114
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
60576115
const MachineOperand *MO) const {
60586116
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
63906448
if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
63916449
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
63926450
legalizeOpWithMove(MI, VOP3Idx[2]);
6451+
6452+
// Fix the register class of packed FP32 instructions on gfx12+. See
6453+
// SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6454+
if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6455+
for (unsigned I = 0; I < 3; ++I) {
6456+
if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6457+
legalizeOpWithMove(MI, VOP3Idx[I]);
6458+
}
6459+
}
63936460
}
63946461

63956462
Register SIInstrInfo::readlaneVGPRToSGPR(

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,6 +1287,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
12871287
const MachineOperand &MO) const;
12881288
bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
12891289
const MachineOperand &MO) const;
1290+
1291+
/// Check if \p MO would be a legal operand for gfx12+ packed math FP32
1292+
/// instructions. Packed math FP32 instructions typically accept SGPRs or
1293+
/// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the
1294+
/// HW can only read the first SGPR and use it for both the low and high
1295+
/// operations.
1296+
/// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2,
1297+
/// respectively. If \p MO is nullptr, the operand corresponding to SrcN will
1298+
/// be used.
/// \returns true if the source (or its modifiers operand) is absent from the
/// instruction, if the operand is not an SGPR register, or if neither the
/// OP_SEL_0 nor the OP_SEL_1 bit is set in the source's modifiers; false
/// otherwise.
1299+
bool isLegalGFX12PlusPackedMathFP32Operand(
1300+
const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
1301+
const MachineOperand *MO = nullptr) const;
1302+
12901303
/// Legalize operands in \p MI by either commuting it or inserting a
12911304
/// copy of src1.
12921305
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
33183318
return 128;
33193319
}
33203320

3321+
bool isPackedFP32Inst(unsigned Opc) {
3322+
switch (Opc) {
3323+
case AMDGPU::V_PK_ADD_F32:
3324+
case AMDGPU::V_PK_ADD_F32_gfx12:
3325+
case AMDGPU::V_PK_MUL_F32:
3326+
case AMDGPU::V_PK_MUL_F32_gfx12:
3327+
case AMDGPU::V_PK_FMA_F32:
3328+
case AMDGPU::V_PK_FMA_F32_gfx12:
3329+
return true;
3330+
default:
3331+
return false;
3332+
}
3333+
}
3334+
33213335
} // namespace AMDGPU
33223336

33233337
raw_ostream &operator<<(raw_ostream &OS,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg);
17091709

17101710
bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
17111711

1712+
LLVM_READONLY bool isPackedFP32Inst(unsigned Opc);
1713+
17121714
LLVM_READONLY
17131715
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
17141716
int64_t EncodedOffset);

0 commit comments

Comments
 (0)