From 7c443285ec2df426ca1ff93f236fcb67d735338f Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 29 Jul 2025 15:40:34 -0500 Subject: [PATCH 01/16] initial commit --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 409 ++++++++++++++++++ 1 file changed, 409 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..d76502d18f7e7 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -39,6 +39,21 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" +#include "AMDGPURegisterBankInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" + +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/InitializePasses.h" +#include + +#include "GCNSchedStrategy.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; #define DEBUG_TYPE "amdgpu-pre-ra-optimizations" @@ -53,6 +68,17 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); + bool unpackInsts(MachineFunction &MF); + bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set &seen); + bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const; + bool isUnpackingSupportedInstr(MachineInstr &MI) const; + void insertMI(MachineInstr &I); + SmallVector copyToVregAndInsertMI(MachineInstr &I, + unsigned SGPRSrcPos); + SmallVector + insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, + MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, + bool isVreg_64); public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -62,6 +88,7 @@ class GCNPreRAOptimizationsImpl { class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { public: static char ID; + const MachineLoopInfo *MLI = nullptr; GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) { initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry()); @@ -75,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -225,10 +253,390 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { return true; } +bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const { + const GCNSubtarget &ST = MF->getSubtarget(); + // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts(); + // if (!IsGFX942Only) + // return false; + + if (!SIInstrInfo::isVALU(MI)){ + return false; + } + + + // V_COS, V_EXP, V_RCP, etc. + if (SIInstrInfo::isTRANS(MI)) + return true; + + // DOT2, DOT2C, DOT4, etc. 
+ if (SIInstrInfo::isDOT(MI)) + return true; + + // MFMA, SMFMA + if (SIInstrInfo::isMFMA(MI)) + return true; + + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_CVT_PK_BF8_F32_e64: + case AMDGPU::V_CVT_PK_FP8_F32_e64: + case AMDGPU::V_MQSAD_PK_U16_U8_e64: + case AMDGPU::V_MQSAD_U32_U8_e64: + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_I16: + case AMDGPU::V_PK_ADD_U16: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_FMA_F16: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMAC_F16_e32: + case AMDGPU::V_PK_FMAC_F16_e64: + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_MAD_I16: + case AMDGPU::V_PK_MAD_U16: + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_PK_MAX_I16: + case AMDGPU::V_PK_MAX_U16: + case AMDGPU::V_PK_MIN_F16: + case AMDGPU::V_PK_MIN_I16: + case AMDGPU::V_PK_MIN_U16: + case AMDGPU::V_PK_MOV_B32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_LO_U16: + case AMDGPU::V_PK_SUB_I16: + case AMDGPU::V_PK_SUB_U16: + case AMDGPU::V_QSAD_PK_U16_U8_e64: + return true; + + default: + return false; + + } +} + +bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + return true; + + default: + return false; + + } +} + +SmallVector +GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, + unsigned SGPRSrcPos) { + SmallVector MIList; + + MachineBasicBlock &MBB = *I.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); + MachineInstr *CopySGPR1 = + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY)) + .addDef(TmpReg, RegState::Undef) + .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0); + unsigned SubIdx = TRI->composeSubRegIndices( + AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg()); + CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg()); + CopySGPR1->getOperand(0).setSubReg(SubIdx); + LIS->InsertMachineInstrInMaps(*CopySGPR1); + MIList.push_back(CopySGPR1); + + MachineInstr *CopySGPR2 = + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY)) + .addDef(TmpReg) + .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1); + SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1, + CopySGPR2->getOperand(0).getSubReg()); + CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg()); + CopySGPR2->getOperand(0).setSubReg(SubIdx); + LIS->InsertMachineInstrInMaps(*CopySGPR2); + MIList.push_back(CopySGPR2); + return MIList; +} + +bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( + MachineInstr &BeginMI, std::unordered_set &seen) { + auto *BB = BeginMI.getParent(); + auto *MF = BB->getParent(); + int NumInst = 0; + + auto E = BB->end(); + auto schedModel = TII->getSchedModel(); + const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI); + const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle; + int totalCyclesBetweenCandidates = 0; + for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { + MachineInstr &Instr = *I; + const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr); + totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; + 
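+    // Keep a running total of issue cycles for the instructions that follow
+    // the MFMA; once this total exceeds the MFMA's latency (NumMFMACycles),
+    // further unpacked copies could no longer be hidden behind the MFMA, so
+    // the scan below gives up.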
if (Instr.isMetaInstruction()) + continue; + + if (Instr.isTerminator()) + return false; + + if (totalCyclesBetweenCandidates > NumMFMACycles) + return false; + + if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) { + totalCyclesBetweenCandidates += 1; + seen.insert(&Instr); + } + } + return true; +} + +SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( + MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, + MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) { + + SmallVector MIList; + MachineBasicBlock &MBB = *I.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register DstReg = DstMO.getReg(); + + unsigned SrcSubIdx1 = + TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); + unsigned SrcSubIdx2 = + TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); + unsigned DestSubIdx = + TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); + + const MCInstrDesc instrDesc = I.getDesc(); + + int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int64_t clampVal = I.getOperand(clampIdx).getImm(); + + int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); + unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); + + //don't worry about abs values. Packed instructions (VOP3P) do not support them + unsigned Lo_src0_mods = 0; + unsigned Lo_src1_mods = 0; + + MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64)); + Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst + if (src0_Mods & SISrcMods::OP_SEL_0) { + if (src0_Mods & SISrcMods::NEG) { + Lo_src0_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers + unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 + } + else { + Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers + unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); + Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1 + } + + if (src1_Mods & SISrcMods::OP_SEL_0) { + if (src1_Mods & SISrcMods::NEG) { + Lo_src1_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers + unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 + } + else { + Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers + unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); + Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + } + Op0L_Op1L.addImm(clampVal); //clamp + //packed instructions do not support output modifiers. 
safe to assign them 0 for this use case + Op0L_Op1L.addImm(0); //omod + + if (isVreg_64) { + Op0L_Op1L->getOperand(0).setIsUndef(); + } + else { + if (I.getOperand(0).isUndef()) { + Op0L_Op1L->getOperand(0).setIsUndef(); + } + } + + LIS->InsertMachineInstrInMaps(*Op0L_Op1L); + + SrcSubIdx1 = + TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); + SrcSubIdx2 = + TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); + DestSubIdx = + TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); + + //don't worry about abs values. Packed instructions (VOP3P) do not support them + unsigned Hi_src0_mods = 0; + unsigned Hi_src1_mods = 0; + + MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64)); + Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst + if (src0_Mods & SISrcMods::OP_SEL_1) { + if (src0_Mods & SISrcMods::NEG_HI) { + Hi_src0_mods |= SISrcMods::NEG; + } + Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers + unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 + } + else { + Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers + unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0); + Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + } + + if (src1_Mods & SISrcMods::OP_SEL_1) { + if (src1_Mods & SISrcMods::NEG_HI) { + Hi_src1_mods |= SISrcMods::NEG; + } + Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers + unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 + } + else { + Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers + unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0); + Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + } + Op0H_Op1H.addImm(clampVal); //clamp + //packed instructions do not support output modifiers. 
safe to assign them 0 for this use case + Op0H_Op1H.addImm(0); //omod + LIS->InsertMachineInstrInMaps(*Op0H_Op1H); + + if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); + } + LIS->RemoveMachineInstrFromMaps(I); + I.eraseFromParent(); + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + MIList.push_back(Op0L_Op1L); + MIList.push_back(Op0H_Op1H); + return MIList; +} + +void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { + MachineBasicBlock &MBB = *I.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg1 = I.getOperand(2).getReg(); + Register SrcReg2 = I.getOperand(4).getReg(); + + MachineOperand &DstMO = I.getOperand(0); + MachineOperand &SrcMO1 = I.getOperand(2); + MachineOperand &SrcMO2 = I.getOperand(4); + + MachineBasicBlock::iterator MII = I; + const DebugLoc &DL = I.getDebugLoc(); + const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg()); + const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); + const TargetRegisterClass *Src0SubRC = + TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1); + + if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) || + (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) { + if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) { + // try with sgpr32 + SmallVector copyInstrs = copyToVregAndInsertMI(I, 4); + MachineInstr *CopySGPR1 = copyInstrs[0]; + MachineInstr *CopySGPR2 = copyInstrs[1]; + + if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, + CopySGPR2->getOperand(0), true); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); + } else { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, + CopySGPR2->getOperand(0), false); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); + } + } + else { + SmallVector copyInstrs = copyToVregAndInsertMI(I, 2); + MachineInstr *CopySGPR1 = copyInstrs[0]; + MachineInstr *CopySGPR2 = copyInstrs[1]; + + if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); + } else { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); + } + } + return; + } + + if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, SrcMO1, SrcMO2, SrcMO1, + SrcMO2, false); 
+ } + else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, SrcMO1, SrcMO2, SrcMO1, + SrcMO2, true); + } + return; +} + +bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + auto schedModel = TII->getSchedModel(); + for (MachineBasicBlock &MBB : MF) { + std::unordered_set seen; + for (MachineInstr &MI : MBB) { + if (SIInstrInfo::isMFMA(MI)){ + createListOfPackedInstr(MI, seen); + } + + } + if (!seen.empty()) { + for (MachineInstr *MI : seen) + insertMI(*MI); + } + } + return true; +} + bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; LiveIntervals *LIS = &getAnalysis().getLIS(); + MLI = &getAnalysis().getLI(); return GCNPreRAOptimizationsImpl(LIS).run(MF); } @@ -248,6 +656,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { bool Changed = false; + Changed = unpackInsts(MF); for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); if (!LIS->hasInterval(Reg)) From 4bff9657e7016452f6657f1c217e804fd354d3ae Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 29 Jul 2025 15:44:14 -0500 Subject: [PATCH 02/16] add test --- ...unpack-non-coissue-insts-post-scheduler.ll | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll new file mode 100644 index 0000000000000..5c6d376c92e65 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll @@ -0,0 +1,116 @@ +; TODO: change variable names. 
Make test smaller if possible + +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.exp2.f32(float) + +; Function Attrs: nofree norecurse nounwind +define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr { + %29 = tail call i32 @llvm.amdgcn.workgroup.id.x() + + %96 = sext i32 %8 to i64 + %97 = getelementptr half, ptr addrspace(1) %1, i64 %96 + + %115 = icmp slt i32 %29, 16384 + + %135 = icmp slt i32 %29, 1 + + %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29 + %216 = load <8 x half>, ptr addrspace(3) %215, align 16 + + %276 = shl nuw nsw i32 %29, 7 + + %396 = getelementptr half, ptr addrspace(1) %97, i64 1 + %397 = sext i32 %13 to i64 + %398 = getelementptr half, ptr addrspace(1) %97, i64 %397 + + %536 = fsub float 0xFFF0000000000000, 0.5 + %537 = tail call float @llvm.amdgcn.exp2.f32(float %536) + + %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29 + %539 = load <8 x half>, ptr addrspace(3) %538, align 16 + + %573 = icmp ult i32 1, 511 + br i1 %573, label %575, label %574 + +574: ; preds = %28 + br label %575 + +575: ; preds = %574, %28 + %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> + + br label %686 + +686: ; preds = %575, %686 + %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ] + + + %690 = phi i32 [ 0, %575 ], [ %1120, %686 ] + %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ] + %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ] + + %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ] + + + %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ] + %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ] + + %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ] + %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ] + + %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.s.setprio(i16 0) + %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0) + %879 = extractelement <16 x float> %872, i64 0 + + + %957 = insertelement <2 x float> poison, float %.pn347561, i64 0 + %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer + 
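+  ; The <2 x float> fmuls below are expected to be selected as V_PK_MUL_F32,
+  ; the packed instructions this test wants the pass to unpack so they can
+  ; co-issue with the surrounding MFMAs.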
%959 = fmul <2 x float> %759, %958 + %960 = fmul <2 x float> %760, %958 + + %tmp1 = fmul <2 x float> %tmp6, %958 + %tmp2 = fmul <2 x float> %tmp7, %958 + + %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> + + %1116 = getelementptr half, ptr addrspace(1) %692, i64 1 + %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397 + + %1119 = icmp slt i32 %690, 2 + %1120 = select i1 %1119, i32 %690, i32 0 + %.idx359 = shl i32 %1120, 14 + %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359 + + %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> + + %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> + %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> + + %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0) + %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0) + + + %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879) + + %.idx367 = shl i32 %690, 14 + %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367 + + %1412 = add nuw nsw i32 0, 64 + %1413 = icmp samesign ult i32 0, 7936 + %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> + + %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> + + br i1 %1413, label %686, label %1510 + +1510: ; preds = %686 + ret void +} From d3b19c668d30e4dc906a301c13d2cf6a2e434c7a Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 29 Jul 2025 16:31:30 -0500 Subject: [PATCH 03/16] code cleanup --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index d76502d18f7e7..e2c65bf25d31c 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -39,19 +39,12 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" -#include "AMDGPURegisterBankInfo.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" - -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" #include #include "GCNSchedStrategy.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; @@ -88,7 +81,6 @@ class GCNPreRAOptimizationsImpl { class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { public: static char ID; - const MachineLoopInfo *MLI = nullptr; GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) { initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry()); @@ -102,7 +94,6 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -636,7 +627,6 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; LiveIntervals *LIS = &getAnalysis().getLIS(); - MLI = &getAnalysis().getLI(); return 
GCNPreRAOptimizationsImpl(LIS).run(MF); } From c581612e5cd376b5ee6ef19626444dec25e077d6 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 31 Jul 2025 20:02:28 -0500 Subject: [PATCH 04/16] miscellaneous code optimizations and cleanup --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 201 ++++++------------ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 ++++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 + 3 files changed, 127 insertions(+), 134 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index e2c65bf25d31c..844fc1439099f 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -28,6 +28,12 @@ /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of /// the VGPR_32, the COPY can be completely eliminated. /// +/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and V_PK_ADD_F32) +/// adjacent to MFMAs such that they can be co-issued. +/// This helps with overlapping MFMA and certain vector instructions in machine schedules +/// and is expected to improve performance. +/// Only those packed instructions are unpacked that are overlapped by the MFMA latency. +/// Rest should remain untouched. //===----------------------------------------------------------------------===// #include "GCNPreRAOptimizations.h" @@ -38,12 +44,10 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" - +#include "llvm/ADT/DenseSet.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" -#include - #include "GCNSchedStrategy.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); - bool unpackInsts(MachineFunction &MF); - bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set &seen); - bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const; + bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet &instrsToUnpack); bool isUnpackingSupportedInstr(MachineInstr &MI) const; void insertMI(MachineInstr &I); + uint16_t mapToUnpackedOpcode(MachineInstr &I); SmallVector copyToVregAndInsertMI(MachineInstr &I, unsigned SGPRSrcPos); SmallVector @@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { return true; } -bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const { - const GCNSubtarget &ST = MF->getSubtarget(); - // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts(); - // if (!IsGFX942Only) - // return false; - - if (!SIInstrInfo::isVALU(MI)){ - return false; - } - - - // V_COS, V_EXP, V_RCP, etc. - if (SIInstrInfo::isTRANS(MI)) - return true; - - // DOT2, DOT2C, DOT4, etc. 
- if (SIInstrInfo::isDOT(MI)) - return true; - - // MFMA, SMFMA - if (SIInstrInfo::isMFMA(MI)) - return true; - +bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::V_CVT_PK_BF8_F32_e64: - case AMDGPU::V_CVT_PK_FP8_F32_e64: - case AMDGPU::V_MQSAD_PK_U16_U8_e64: - case AMDGPU::V_MQSAD_U32_U8_e64: - case AMDGPU::V_PK_ADD_F16: - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_ADD_I16: - case AMDGPU::V_PK_ADD_U16: - case AMDGPU::V_PK_ASHRREV_I16: - case AMDGPU::V_PK_FMA_F16: - case AMDGPU::V_PK_FMA_F32: - case AMDGPU::V_PK_FMAC_F16_e32: - case AMDGPU::V_PK_FMAC_F16_e64: - case AMDGPU::V_PK_LSHLREV_B16: - case AMDGPU::V_PK_LSHRREV_B16: - case AMDGPU::V_PK_MAD_I16: - case AMDGPU::V_PK_MAD_U16: - case AMDGPU::V_PK_MAX_F16: - case AMDGPU::V_PK_MAX_I16: - case AMDGPU::V_PK_MAX_U16: - case AMDGPU::V_PK_MIN_F16: - case AMDGPU::V_PK_MIN_I16: - case AMDGPU::V_PK_MIN_U16: - case AMDGPU::V_PK_MOV_B32: - case AMDGPU::V_PK_MUL_F16: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_MUL_LO_U16: - case AMDGPU::V_PK_SUB_I16: - case AMDGPU::V_PK_SUB_U16: - case AMDGPU::V_QSAD_PK_U16_U8_e64: - return true; - - default: - return false; + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_MUL_F32: + return true; + + default: + return false; } } -bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const { - unsigned Opcode = MI.getOpcode(); +uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { + unsigned Opcode = I.getOpcode(); switch (Opcode) { - case AMDGPU::V_PK_ADD_F16: - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F16: - case AMDGPU::V_PK_MUL_F32: - return true; - - default: - return false; + case AMDGPU::V_PK_ADD_F32: + return AMDGPU::V_ADD_F32_e64; + case AMDGPU::V_PK_MUL_F32: + return AMDGPU::V_MUL_F32_e64; + default: + return std::numeric_limits::max(); } } @@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, } bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( - MachineInstr &BeginMI, std::unordered_set &seen) { + MachineInstr &BeginMI, DenseSet &instrsToUnpack) { auto *BB = BeginMI.getParent(); auto *MF = BB->getParent(); int NumInst = 0; @@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( if (Instr.isTerminator()) return false; - + if (totalCyclesBetweenCandidates > NumMFMACycles) return false; - if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) { + if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { totalCyclesBetweenCandidates += 1; - seen.insert(&Instr); + instrsToUnpack.insert(&Instr); } } return true; @@ -420,8 +371,8 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( //don't worry about abs values. 
Packed instructions (VOP3P) do not support them unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; - - MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64)); + uint16_t unpackedOpcode = mapToUnpackedOpcode(I); + MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst if (src0_Mods & SISrcMods::OP_SEL_0) { if (src0_Mods & SISrcMods::NEG) { @@ -476,7 +427,7 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( unsigned Hi_src0_mods = 0; unsigned Hi_src1_mods = 0; - MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64)); + MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst if (src0_Mods & SISrcMods::OP_SEL_1) { if (src0_Mods & SISrcMods::NEG_HI) { @@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { return; } -bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) { - - const GCNSubtarget &ST = MF.getSubtarget(); - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - - auto schedModel = TII->getSchedModel(); - for (MachineBasicBlock &MBB : MF) { - std::unordered_set seen; - for (MachineInstr &MI : MBB) { - if (SIInstrInfo::isMFMA(MI)){ - createListOfPackedInstr(MI, seen); - } - - } - if (!seen.empty()) { - for (MachineInstr *MI : seen) - insertMI(*MI); - } - } - return true; -} - bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { bool Changed = false; - Changed = unpackInsts(MF); for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); if (!LIS->hasInterval(Reg)) @@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { Changed |= processReg(Reg); } - if (!ST.useRealTrue16Insts()) - return Changed; - // Add RA hints to improve True16 COPY elimination. - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() != AMDGPU::COPY) - continue; - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - if (Dst.isVirtual() && - MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - Src.isPhysical() && - TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); - if (Src.isVirtual() && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && - Dst.isPhysical() && - TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); - if (!Dst.isVirtual() || !Src.isVirtual()) - continue; - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { - MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); - MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); + // Unpack packed instructions to overlap MFMAs. 
This allows the compiler to co-issue unpacked instructions with MFMA + for (MachineBasicBlock &MBB : MF) { + DenseSet instrsToUnpack; + for (MachineInstr &MI : MBB) { + if (SIInstrInfo::isMFMA(MI)){ + createListOfPackedInstr(MI, instrsToUnpack); + } + if (ST.useRealTrue16Insts()){ + if (MI.getOpcode() != AMDGPU::COPY) + continue; + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (Dst.isVirtual() && + MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && + Src.isPhysical() && + TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); + if (Src.isVirtual() && + MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && + Dst.isPhysical() && + TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); + if (!Dst.isVirtual() || !Src.isVirtual()) + continue; + if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && + MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { + MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); + MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); + } + if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && + MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); + } + + if (!instrsToUnpack.empty()) { + for (MachineInstr *MI : instrsToUnpack) + insertMI(*MI); } } - return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c2da937552240..5562ff590b71d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -15,7 +15,6 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "GCNHazardRecognizer.h" -#include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -6173,6 +6172,64 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { + bool IsGFX950Only = ST.hasGFX950Insts(); + if (!IsGFX950Only) + return false; + + if (!isVALU(MI)) + return false; + + // V_COS, V_EXP, V_RCP, etc. + if (isTRANS(MI)) + return true; + + // DOT2, DOT2C, DOT4, etc. 
+ if (isDOT(MI)) + return true; + + // MFMA, SMFMA + if (isMFMA(MI)) + return true; + + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_CVT_PK_BF8_F32_e64: + case AMDGPU::V_CVT_PK_FP8_F32_e64: + case AMDGPU::V_MQSAD_PK_U16_U8_e64: + case AMDGPU::V_MQSAD_U32_U8_e64: + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_I16: + case AMDGPU::V_PK_ADD_U16: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_FMA_F16: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMAC_F16_e32: + case AMDGPU::V_PK_FMAC_F16_e64: + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_MAD_I16: + case AMDGPU::V_PK_MAD_U16: + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_PK_MAX_I16: + case AMDGPU::V_PK_MAX_U16: + case AMDGPU::V_PK_MIN_F16: + case AMDGPU::V_PK_MIN_I16: + case AMDGPU::V_PK_MIN_U16: + case AMDGPU::V_PK_MOV_B32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_LO_U16: + case AMDGPU::V_PK_SUB_I16: + case AMDGPU::V_PK_SUB_U16: + case AMDGPU::V_QSAD_PK_U16_U8_e64: + return true; + default: + return false; + } +} + void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e042b59eb0f04..b7a0388470279 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1178,6 +1178,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; + bool isNeverCoissue(MachineInstr &MI) const; /// Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. bool hasVALU32BitEncoding(unsigned Opcode) const; From c695b99ddae061127e015daf523b8eeec7888b71 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 1 Aug 2025 09:14:29 -0500 Subject: [PATCH 05/16] add code comments --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 844fc1439099f..0f7009a6ea394 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -262,6 +262,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); + // use 64 bit encoding to allow use of VOP3 instructions. 
+ // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3 + // e32 instructions are VOP2 and don't allow source modifiers switch (Opcode) { case AMDGPU::V_PK_ADD_F32: return AMDGPU::V_ADD_F32_e64; From 1a51a42d4c633cd1a1a84878b2a3dce6764473b4 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 6 Aug 2025 16:24:08 -0500 Subject: [PATCH 06/16] removing repetitive code, capitalize vars --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 95 +++++++------------ 1 file changed, 36 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 0f7009a6ea394..f56d73e990269 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -318,25 +318,25 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( int NumInst = 0; auto E = BB->end(); - auto schedModel = TII->getSchedModel(); - const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI); - const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle; - int totalCyclesBetweenCandidates = 0; + auto SchedModel = TII->getSchedModel(); + const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI); + const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + int TotalCyclesBetweenCandidates = 0; for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; - const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr); - totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; + const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); + TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; if (Instr.isMetaInstruction()) continue; if (Instr.isTerminator()) return false; - if (totalCyclesBetweenCandidates > NumMFMACycles) + if (TotalCyclesBetweenCandidates > NumMFMACycles) return false; if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { - totalCyclesBetweenCandidates += 1; + TotalCyclesBetweenCandidates += 1; instrsToUnpack.insert(&Instr); } } @@ -411,10 +411,8 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( if (isVreg_64) { Op0L_Op1L->getOperand(0).setIsUndef(); } - else { - if (I.getOperand(0).isUndef()) { - Op0L_Op1L->getOperand(0).setIsUndef(); - } + else if (I.getOperand(0).isUndef()){ + Op0L_Op1L->getOperand(0).setIsUndef(); } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); @@ -499,58 +497,37 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1); - if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) || - (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) { - if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) { - // try with sgpr32 - SmallVector copyInstrs = copyToVregAndInsertMI(I, 4); - MachineInstr *CopySGPR1 = copyInstrs[0]; - MachineInstr *CopySGPR2 = copyInstrs[1]; - - if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, - CopySGPR2->getOperand(0), true); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); - } else { - 
SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, - CopySGPR2->getOperand(0), false); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); - } - } - else { - SmallVector copyInstrs = copyToVregAndInsertMI(I, 2); - MachineInstr *CopySGPR1 = copyInstrs[0]; - MachineInstr *CopySGPR2 = copyInstrs[1]; - - if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); - } else { - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); - } - } - return; - } + if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) { + // try with sgpr32 + SmallVector copyInstrs = copyToVregAndInsertMI(I, 4); + MachineInstr *CopySGPR1 = copyInstrs[0]; + MachineInstr *CopySGPR2 = copyInstrs[1]; - if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) { + bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, SrcMO2, SrcMO1, - SrcMO2, false); + I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, + CopySGPR2->getOperand(0), isVReg64); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); + return; } - else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) { + else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) { + SmallVector copyInstrs = copyToVregAndInsertMI(I, 2); + MachineInstr *CopySGPR1 = copyInstrs[0]; + MachineInstr *CopySGPR2 = copyInstrs[1]; + + bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, SrcMO2, SrcMO1, - SrcMO2, true); + I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64); + unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); + return; } + + bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); + SmallVector unpackedInstrs = insertUnpackedMI( + I, DstMO, SrcMO1, SrcMO2, SrcMO1, + SrcMO2, isVReg64); return; } From e9056e866ab3dd91e145430e83b9603f76d8b486 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 15 Aug 2025 18:00:36 -0500 Subject: [PATCH 07/16] adding support for FP16 ops --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 316 +++++++++++++++++- ...unpack-non-coissue-insts-post-scheduler.ll | 116 ------- 2 files changed, 302 insertions(+), 130 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index f56d73e990269..33e07c5a16d97 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ 
-44,13 +44,14 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" #include "GCNSchedStrategy.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" +#include using namespace llvm; #define DEBUG_TYPE "amdgpu-pre-ra-optimizations" @@ -65,7 +66,7 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); - bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet &instrsToUnpack); + bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector &instrsToUnpack, uint16_t NumMFMACycles); bool isUnpackingSupportedInstr(MachineInstr &MI) const; void insertMI(MachineInstr &I); uint16_t mapToUnpackedOpcode(MachineInstr &I); @@ -75,6 +76,10 @@ class GCNPreRAOptimizationsImpl { insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64); + void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget); + bool IsF16MaskSet; + Register MaskLo; //mask to extract lower 16 bits for F16 packed instructions + Register ShiftAmt; //mask to extract higher 16 bits from F16 packed instructions public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -252,6 +257,8 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons switch (Opcode) { case AMDGPU::V_PK_ADD_F32: case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_ADD_F16: return true; default: @@ -270,6 +277,10 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { return AMDGPU::V_ADD_F32_e64; case AMDGPU::V_PK_MUL_F32: return AMDGPU::V_MUL_F32_e64; + case AMDGPU::V_PK_ADD_F16: + return AMDGPU::V_ADD_F16_e64; + case AMDGPU::V_PK_MUL_F16: + return AMDGPU::V_MUL_F16_e64; default: return std::numeric_limits::max(); @@ -312,16 +323,15 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, } bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( - MachineInstr &BeginMI, DenseSet &instrsToUnpack) { + MachineInstr &BeginMI, SetVector &instrsToUnpack, uint16_t NumMFMACycles) { auto *BB = BeginMI.getParent(); auto *MF = BB->getParent(); int NumInst = 0; auto E = BB->end(); - auto SchedModel = TII->getSchedModel(); - const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI); - const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + int TotalCyclesBetweenCandidates = 0; + auto SchedModel = TII->getSchedModel(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); @@ -334,10 +344,41 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( if (TotalCyclesBetweenCandidates > NumMFMACycles) return false; - + if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { - TotalCyclesBetweenCandidates += 1; - instrsToUnpack.insert(&Instr); + if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){ + // unpacking packed F16 instructions requires multiple instructions. 
Instructions are issued to extract lower and higher bits for each operand + // Instructions are then issued for 2 unpacked instructions, and additional instructions to put them back into the original destination register + // The following sequence of instructions are issued + + // The next two are needed to move masks into vgprs. Ideally, immediates should be used. However, if one of the source operands are sgpr/sregs, + // then immediates are not allowed. Hence, the need to move these into vgprs + + // vgpr_32 = V_MOV_B32_e32 65535 + // vgpr_32 = V_MOV_B32_e32 16 + + // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32 + // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64 + // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32 + // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32 + // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0 + // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0 + // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32 + // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32 + + // we need to issue the MOV instructions above only once. Once these are issued, the IsF16MaskSet flag is set + // subsequent unpacking only needs to issue the remaining instructions + // The number of latency cycles for each instruction above is 1. It's hard coded into the code to reduce code complexity. + if (IsF16MaskSet) + TotalCyclesBetweenCandidates += 7; + else + TotalCyclesBetweenCandidates += 9; + } + else + TotalCyclesBetweenCandidates += 1; + + if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) + instrsToUnpack.insert(&Instr); } } return true; @@ -531,6 +572,242 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { return; } +void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) { + MachineBasicBlock &MBB = *I.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &DstMO = I.getOperand(0); + MachineOperand &SrcMO0 = I.getOperand(2); + MachineOperand &SrcMO1 = I.getOperand(4); + + Register DstReg = DstMO.getReg(); + Register SrcReg0 = SrcMO0.getReg(); + Register SrcReg1 = SrcMO1.getReg(); + + const DebugLoc &DL = I.getDebugLoc(); + + const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; + auto SchedModel = TII->getSchedModel(); + + uint16_t AddlCyclesConsumed = 0; + SetVector ListOfNewInstructions; + + auto BuildImm = [&](uint32_t Val) -> std::pair { + Register ImmReg = MRI.createVirtualRegister(RC); + auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(Val); + LIS->InsertMachineInstrInMaps(*newMI); + const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI); + uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + return {ImmReg, LatencyCycles}; + }; + + + if (!IsF16MaskSet) { + std::pair RegAndLatency = BuildImm(0x0000FFFF); + MaskLo = RegAndLatency.first; //mask for lower 16 bits + AddlCyclesConsumed += RegAndLatency.second; + RegAndLatency = BuildImm(16); + ShiftAmt = RegAndLatency.first; //mask for higher 16 bits + AddlCyclesConsumed += RegAndLatency.second; + IsF16MaskSet = true; + } + + Register Src0_Lo = MRI.createVirtualRegister(RC); + Register Src1_Lo = MRI.createVirtualRegister(RC); + Register Src0_Hi = MRI.createVirtualRegister(RC); + Register Src1_Hi = MRI.createVirtualRegister(RC); + Register Input0 = MRI.createVirtualRegister(RC); + Register Input1 = MRI.createVirtualRegister(RC); + + unsigned SubRegID = 0; + if (SrcMO0.getSubReg()) + SubRegID = SrcMO0.getSubReg(); + + int 
src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); + unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); + int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int64_t clampVal = I.getOperand(clampIdx).getImm(); + + // handle op_sel for src0 + if (src0_Mods & SISrcMods::OP_SEL_0) { + // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo) + .addReg(ShiftAmt); + if (SubRegID) + LoInput0_MI.addReg(SrcReg0, 0, SubRegID); + else + LoInput0_MI.addReg(SrcReg0); + LIS->InsertMachineInstrInMaps(*LoInput0_MI); + } + else { + // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo); + if (SubRegID) + LoInput0_MI.addReg(SrcReg0, 0, SubRegID); + else + LoInput0_MI.addReg(SrcReg0); + LoInput0_MI.addReg(MaskLo); + LIS->InsertMachineInstrInMaps(*LoInput0_MI); + } + + // handle op_sel_hi for src0 + if (src0_Mods & SISrcMods::OP_SEL_1) { + // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi) + .addReg(ShiftAmt); + if (SubRegID) + HiInput0_MI.addReg(SrcReg0, 0, SubRegID); + else + HiInput0_MI.addReg(SrcReg0); + LIS->InsertMachineInstrInMaps(*HiInput0_MI); + } + else { + // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi); + if (SubRegID) + HiInput0_MI.addReg(SrcReg0, 0, SubRegID); + else + HiInput0_MI.addReg(SrcReg0); + HiInput0_MI.addReg(MaskLo); + LIS->InsertMachineInstrInMaps(*HiInput0_MI); + } + + SubRegID = 0; + if (SrcMO0.getSubReg()) + SubRegID = SrcMO1.getSubReg(); + // handle op_sel for src1 + if (src1_Mods & SISrcMods::OP_SEL_0) { + // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo) + .addReg(ShiftAmt); + if (SubRegID) + LoInput1_MI.addReg(SrcReg1, 0, SubRegID); + else + LoInput1_MI.addReg(SrcReg1); + LIS->InsertMachineInstrInMaps(*LoInput1_MI); + } + else { + // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo); + if (SubRegID) + LoInput1_MI.addReg(SrcReg1, 0, SubRegID); + else + LoInput1_MI.addReg(SrcReg1); + LoInput1_MI.addReg(MaskLo); + LIS->InsertMachineInstrInMaps(*LoInput1_MI); + } + + // handle op_sel_hi for src1 + if (src1_Mods & SISrcMods::OP_SEL_1) { + // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi) + .addReg(ShiftAmt); + if (SubRegID) + HiInput1_MI.addReg(SrcReg1, 0, SubRegID); + else + HiInput1_MI.addReg(SrcReg1); + LIS->InsertMachineInstrInMaps(*HiInput1_MI); + } + else { + // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr + MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, 
TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi); + if (SubRegID) + HiInput1_MI.addReg(SrcReg1, 0, SubRegID); + else + HiInput1_MI.addReg(SrcReg1); + HiInput1_MI.addReg(MaskLo); + LIS->InsertMachineInstrInMaps(*HiInput1_MI); + } + + Register LoMul = MRI.createVirtualRegister(RC); + Register HiMul = MRI.createVirtualRegister(RC); + + unsigned Lo_src0_mods = 0; + unsigned Lo_src1_mods = 0; + uint16_t unpackedOpcode = mapToUnpackedOpcode(I); + + // Unpacked instructions + MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul); + + if (src0_Mods & SISrcMods::NEG) + Lo_src0_mods |= SISrcMods::NEG; + + LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers + LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0 + + if (src1_Mods & SISrcMods::NEG) + Lo_src1_mods |= SISrcMods::NEG; + + LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers + LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1 + LoMul_MI.addImm(clampVal); //clamp + //packed instructions do not support output modifiers. safe to assign them 0 for this use case + LoMul_MI.addImm(0); //omod + + // unpacked instruction with VOP3 encoding for Hi bits + unsigned Hi_src0_mods = 0; + unsigned Hi_src1_mods = 0; + + MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul); + if (src0_Mods & SISrcMods::NEG_HI) + Hi_src0_mods |= SISrcMods::NEG_HI; + + HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers + HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set + + if (src1_Mods & SISrcMods::NEG_HI) + Hi_src1_mods |= SISrcMods::NEG_HI; + + HiMul_MI.addImm(Hi_src1_mods); //src0_modifiers + HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set + HiMul_MI.addImm(clampVal); //clamp + //packed instructions do not support output modifiers. safe to assign them 0 for this use case + HiMul_MI.addImm(0); //omod + + // Shift HiMul left by 16 + Register HiMulShifted = MRI.createVirtualRegister(RC); + MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted) + .addReg(ShiftAmt) + .addReg(HiMul); + + SubRegID = 0; + if (DstMO.getSubReg()) + SubRegID = DstMO.getSubReg(); + // OR LoMul | (HiMul << 16) + MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64)); + if (SubRegID) { + if (DstMO.isUndef()){ + RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID); + } + else { + RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID); + } + } + else { + if (DstMO.isUndef()){ + RewriteBackToDst_MI.addDef(DstReg, RegState::Undef); + } + else { + RewriteBackToDst_MI.addDef(DstReg); + } + } + RewriteBackToDst_MI.addReg(LoMul); + RewriteBackToDst_MI.addReg(HiMulShifted); + + LIS->InsertMachineInstrInMaps(*LoMul_MI); + LIS->InsertMachineInstrInMaps(*HiMul_MI); + LIS->InsertMachineInstrInMaps(*HiMulShifted_MI); + LIS->InsertMachineInstrInMaps(*RewriteBackToDst_MI); + LIS->RemoveMachineInstrFromMaps(I); + I.eraseFromParent(); + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + +} + bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -569,10 +846,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // Add RA hints to improve True16 COPY elimination. // Unpack packed instructions to overlap MFMAs. 
This allows the compiler to co-issue unpacked instructions with MFMA for (MachineBasicBlock &MBB : MF) { - DenseSet instrsToUnpack; + SetVector instrsToUnpack; + IsF16MaskSet = false; + uint16_t NumMFMACycles = 0; + auto SchedModel = TII->getSchedModel(); for (MachineInstr &MI : MBB) { if (SIInstrInfo::isMFMA(MI)){ - createListOfPackedInstr(MI, instrsToUnpack); + const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI); + NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles); } if (ST.useRealTrue16Insts()){ if (MI.getOpcode() != AMDGPU::COPY) @@ -603,9 +885,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { } if (!instrsToUnpack.empty()) { - for (MachineInstr *MI : instrsToUnpack) - insertMI(*MI); + for (MachineInstr *MI : instrsToUnpack) { + if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) { + processF16Unpacking(*MI, NumMFMACycles); + } + else { + insertMI(*MI); + } + } } } return Changed; -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll deleted file mode 100644 index 5c6d376c92e65..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll +++ /dev/null @@ -1,116 +0,0 @@ -; TODO: change variable names. Make test smaller if possible - -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" -target triple = "amdgcn-amd-amdhsa" - -@global_smem = external addrspace(3) global [0 x i8], align 16 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.exp2.f32(float) - -; Function Attrs: nofree norecurse nounwind -define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr { - %29 = tail call i32 @llvm.amdgcn.workgroup.id.x() - - %96 = sext i32 %8 to i64 - %97 = getelementptr half, ptr addrspace(1) %1, i64 %96 - - %115 = icmp slt i32 %29, 16384 - - %135 = icmp slt i32 %29, 1 - - %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29 - %216 = load <8 x half>, ptr addrspace(3) %215, align 16 - - %276 = shl nuw nsw i32 %29, 7 - - %396 = getelementptr half, ptr addrspace(1) %97, i64 1 - %397 = sext i32 %13 to i64 - %398 = getelementptr half, ptr addrspace(1) %97, i64 %397 - - %536 = fsub float 0xFFF0000000000000, 0.5 - %537 = tail call float @llvm.amdgcn.exp2.f32(float %536) - - %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) 
@global_smem, i32 16384), i32 %29 - %539 = load <8 x half>, ptr addrspace(3) %538, align 16 - - %573 = icmp ult i32 1, 511 - br i1 %573, label %575, label %574 - -574: ; preds = %28 - br label %575 - -575: ; preds = %574, %28 - %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> - - br label %686 - -686: ; preds = %575, %686 - %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ] - - - %690 = phi i32 [ 0, %575 ], [ %1120, %686 ] - %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ] - %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ] - - %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ] - - - %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ] - %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ] - - %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ] - %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ] - - %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) - tail call void @llvm.amdgcn.s.setprio(i16 0) - %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0) - %879 = extractelement <16 x float> %872, i64 0 - - - %957 = insertelement <2 x float> poison, float %.pn347561, i64 0 - %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer - %959 = fmul <2 x float> %759, %958 - %960 = fmul <2 x float> %760, %958 - - %tmp1 = fmul <2 x float> %tmp6, %958 - %tmp2 = fmul <2 x float> %tmp7, %958 - - %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> - - %1116 = getelementptr half, ptr addrspace(1) %692, i64 1 - %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397 - - %1119 = icmp slt i32 %690, 2 - %1120 = select i1 %1119, i32 %690, i32 0 - %.idx359 = shl i32 %1120, 14 - %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359 - - %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> - - %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> - %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> - - %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0) - %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0) - - - %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879) - - %.idx367 = shl i32 %690, 14 - %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367 - - %1412 = add nuw nsw i32 0, 64 - %1413 = icmp samesign ult i32 0, 7936 - %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> - - %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> - %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> - - br i1 %1413, label %686, label %1510 - -1510: ; preds = %686 - ret void -} From 5cb47d262a7d865e2ce9fa006e079db2676b4edb Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 15 Aug 2025 18:24:13 -0500 Subject: [PATCH 08/16] code fix --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 33e07c5a16d97..5dac4a210101e 100644 --- 
a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -854,7 +854,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { if (SIInstrInfo::isMFMA(MI)){ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI); NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles); + createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles); } if (ST.useRealTrue16Insts()){ if (MI.getOpcode() != AMDGPU::COPY) From 178a36354b4b109c4c59572f90205457386c77e6 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Sun, 17 Aug 2025 09:32:00 -0500 Subject: [PATCH 09/16] clang-formatted and mir tests added --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 493 ++++++++++-------- ...npack-non-coissue-insts-post-scheduler.mir | 209 ++++++++ 2 files changed, 482 insertions(+), 220 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 5dac4a210101e..9a2f898dcb2de 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -28,30 +28,28 @@ /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of /// the VGPR_32, the COPY can be completely eliminated. /// -/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and V_PK_ADD_F32) -/// adjacent to MFMAs such that they can be co-issued. -/// This helps with overlapping MFMA and certain vector instructions in machine schedules +/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and +/// V_PK_ADD_F32) adjacent to MFMAs such that they can be co-issued. This helps +/// with overlapping MFMA and certain vector instructions in machine schedules /// and is expected to improve performance. -/// Only those packed instructions are unpacked that are overlapped by the MFMA latency. -/// Rest should remain untouched. +/// Only those packed instructions are unpacked that are overlapped by the MFMA +/// latency. Rest should remain untouched. 
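+///
+/// As a schematic example (register classes and modifiers elided), a packed
+/// multiply such as
+///   dst:vreg_64 = V_PK_MUL_F32 src0_mods, src0, src1_mods, src1, clamp, ...
+/// becomes two VOP3 multiplies over the 32-bit halves of the destination:
+///   dst.sub0 = V_MUL_F32_e64 lo_mods, src0.subN, lo_mods, src1.subN, clamp, 0
+///   dst.sub1 = V_MUL_F32_e64 hi_mods, src0.subN, hi_mods, src1.subN, clamp, 0
+/// where op_sel/op_sel_hi pick sub0 or sub1 of each source and the remaining
+/// VOP3P source modifiers are remapped to their VOP3 equivalents.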
//===----------------------------------------------------------------------===// -#include "GCNPreRAOptimizations.h" #include "AMDGPU.h" +#include "GCNPreRAOptimizations.h" +#include "GCNSchedStrategy.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/InitializePasses.h" -#include "llvm/ADT/SetVector.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/InitializePasses.h" -#include "GCNSchedStrategy.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" -#include +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "amdgpu-pre-ra-optimizations" @@ -66,20 +64,24 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); - bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector &instrsToUnpack, uint16_t NumMFMACycles); + bool createListOfPackedInstr(MachineInstr &BeginMI, + SetVector &instrsToUnpack, + uint16_t NumMFMACycles); bool isUnpackingSupportedInstr(MachineInstr &MI) const; void insertMI(MachineInstr &I); uint16_t mapToUnpackedOpcode(MachineInstr &I); SmallVector copyToVregAndInsertMI(MachineInstr &I, unsigned SGPRSrcPos); SmallVector - insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, - MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, + insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, + MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, + MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64); void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget); bool IsF16MaskSet; - Register MaskLo; //mask to extract lower 16 bits for F16 packed instructions - Register ShiftAmt; //mask to extract higher 16 bits from F16 packed instructions + Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions + Register + ShiftAmt; // mask to extract higher 16 bits from F16 packed instructions public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -252,18 +254,18 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { return true; } -bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const { +bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( + MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_MUL_F16: - case AMDGPU::V_PK_ADD_F16: - return true; - - default: - return false; + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_ADD_F16: + return true; + default: + return false; } } @@ -273,23 +275,22 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3 // e32 instructions are VOP2 and don't allow source modifiers switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - return AMDGPU::V_ADD_F32_e64; - case AMDGPU::V_PK_MUL_F32: - return AMDGPU::V_MUL_F32_e64; - case AMDGPU::V_PK_ADD_F16: - return AMDGPU::V_ADD_F16_e64; - case AMDGPU::V_PK_MUL_F16: - return AMDGPU::V_MUL_F16_e64; - default: - return std::numeric_limits::max(); - + case AMDGPU::V_PK_ADD_F32: + return AMDGPU::V_ADD_F32_e64; + case AMDGPU::V_PK_MUL_F32: + 
return AMDGPU::V_MUL_F32_e64; + case AMDGPU::V_PK_ADD_F16: + return AMDGPU::V_ADD_F16_e64; + case AMDGPU::V_PK_MUL_F16: + return AMDGPU::V_MUL_F16_e64; + default: + return std::numeric_limits::max(); } } SmallVector GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, - unsigned SGPRSrcPos) { + unsigned SGPRSrcPos) { SmallVector MIList; MachineBasicBlock &MBB = *I.getParent(); @@ -323,37 +324,46 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, } bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( - MachineInstr &BeginMI, SetVector &instrsToUnpack, uint16_t NumMFMACycles) { + MachineInstr &BeginMI, SetVector &instrsToUnpack, + uint16_t NumMFMACycles) { auto *BB = BeginMI.getParent(); auto *MF = BB->getParent(); int NumInst = 0; auto E = BB->end(); - + int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; - const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); - TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; + const MCSchedClassDesc *instrSchedClassDesc = + SchedModel.resolveSchedClass(&Instr); + TotalCyclesBetweenCandidates += + SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; if (Instr.isMetaInstruction()) continue; if (Instr.isTerminator()) return false; - + if (TotalCyclesBetweenCandidates > NumMFMACycles) return false; - + if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { - if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){ - // unpacking packed F16 instructions requires multiple instructions. Instructions are issued to extract lower and higher bits for each operand - // Instructions are then issued for 2 unpacked instructions, and additional instructions to put them back into the original destination register - // The following sequence of instructions are issued - - // The next two are needed to move masks into vgprs. Ideally, immediates should be used. However, if one of the source operands are sgpr/sregs, - // then immediates are not allowed. Hence, the need to move these into vgprs - + if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || + (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) { + // unpacking packed F16 instructions requires multiple instructions. + // Instructions are issued to extract lower and higher bits for each + // operand Instructions are then issued for 2 unpacked instructions, and + // additional instructions to put them back into the original + // destination register The following sequence of instructions are + // issued + + // The next two are needed to move masks into vgprs. Ideally, immediates + // should be used. However, if one of the source operands are + // sgpr/sregs, then immediates are not allowed. Hence, the need to move + // these into vgprs + // vgpr_32 = V_MOV_B32_e32 65535 // vgpr_32 = V_MOV_B32_e32 16 @@ -365,18 +375,19 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0 // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32 // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32 - - // we need to issue the MOV instructions above only once. Once these are issued, the IsF16MaskSet flag is set - // subsequent unpacking only needs to issue the remaining instructions - // The number of latency cycles for each instruction above is 1. 
It's hard coded into the code to reduce code complexity. - if (IsF16MaskSet) + + // we need to issue the MOV instructions above only once. Once these are + // issued, the IsF16MaskSet flag is set subsequent unpacking only needs + // to issue the remaining instructions The number of latency cycles for + // each instruction above is 1. It's hard coded into the code to reduce + // code complexity. + if (IsF16MaskSet) TotalCyclesBetweenCandidates += 7; else TotalCyclesBetweenCandidates += 9; - } - else + } else TotalCyclesBetweenCandidates += 1; - + if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) instrsToUnpack.insert(&Instr); } @@ -385,8 +396,9 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( } SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( - MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, - MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) { + MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, + MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, + MachineOperand &HiSrcMO2, bool isVreg_64) { SmallVector MIList; MachineBasicBlock &MBB = *I.getParent(); @@ -404,103 +416,117 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( const MCInstrDesc instrDesc = I.getDesc(); - int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int clampIdx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); int64_t clampVal = I.getOperand(clampIdx).getImm(); - int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + int src0_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int src1_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); - //don't worry about abs values. Packed instructions (VOP3P) do not support them + // don't worry about abs values. 
Packed instructions (VOP3P) do not support + // them unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; uint16_t unpackedOpcode = mapToUnpackedOpcode(I); MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); - Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst + Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst if (src0_Mods & SISrcMods::OP_SEL_0) { if (src0_Mods & SISrcMods::NEG) { Lo_src0_mods |= SISrcMods::NEG; } - Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers - unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 - } - else { - Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers - unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); - Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1 + Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers + unsigned Src0SubIdx = + TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0 + } else { + Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers + unsigned Src0SubIdx = + TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); + Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, + Src0SubIdx); // src0 //if op_sel == 0, select register 0 of + // reg:sub0_sub1 } - if (src1_Mods & SISrcMods::OP_SEL_0) { if (src1_Mods & SISrcMods::NEG) { Lo_src1_mods |= SISrcMods::NEG; } - Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers - unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 - } - else { - Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers - unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); - Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - } - Op0L_Op1L.addImm(clampVal); //clamp - //packed instructions do not support output modifiers. safe to assign them 0 for this use case - Op0L_Op1L.addImm(0); //omod + Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers + unsigned Src1SubIdx = + TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src0 + } else { + Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers + unsigned Src1SubIdx = + TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); + Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, + Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 + // of reg:sub0_sub1 + } + Op0L_Op1L.addImm(clampVal); // clamp + // packed instructions do not support output modifiers. safe to assign them 0 + // for this use case + Op0L_Op1L.addImm(0); // omod if (isVreg_64) { Op0L_Op1L->getOperand(0).setIsUndef(); - } - else if (I.getOperand(0).isUndef()){ + } else if (I.getOperand(0).isUndef()) { Op0L_Op1L->getOperand(0).setIsUndef(); } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - SrcSubIdx1 = - TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); - SrcSubIdx2 = - TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); - DestSubIdx = - TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); + SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); + SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); + DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); - //don't worry about abs values. 
Packed instructions (VOP3P) do not support them + // don't worry about abs values. Packed instructions (VOP3P) do not support + // them unsigned Hi_src0_mods = 0; unsigned Hi_src1_mods = 0; MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); - Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst + Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst if (src0_Mods & SISrcMods::OP_SEL_1) { if (src0_Mods & SISrcMods::NEG_HI) { Hi_src0_mods |= SISrcMods::NEG; } - Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers - unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 - } - else { - Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers - unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0); - Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers + unsigned Src0SubIdx = + TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0 + } else { + Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers + unsigned Src0SubIdx = + TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0); + Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, + Src0SubIdx); // src0 //if op_sel_hi == 0, select register 0 + // of reg:sub0_sub1 } if (src1_Mods & SISrcMods::OP_SEL_1) { if (src1_Mods & SISrcMods::NEG_HI) { Hi_src1_mods |= SISrcMods::NEG; } - Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers - unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 - } - else { - Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers - unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0); - Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - } - Op0H_Op1H.addImm(clampVal); //clamp - //packed instructions do not support output modifiers. safe to assign them 0 for this use case - Op0H_Op1H.addImm(0); //omod + Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers + unsigned Src1SubIdx = + TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src0 + } else { + Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers + unsigned Src1SubIdx = + TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0); + Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, + Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 + // of reg:sub0_sub1 + } + Op0H_Op1H.addImm(clampVal); // clamp + // packed instructions do not support output modifiers. 
safe to assign them 0 + // for this use case + Op0H_Op1H.addImm(0); // omod LIS->InsertMachineInstrInMaps(*Op0H_Op1H); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { @@ -524,16 +550,15 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { Register DstReg = I.getOperand(0).getReg(); Register SrcReg1 = I.getOperand(2).getReg(); Register SrcReg2 = I.getOperand(4).getReg(); - MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); - MachineBasicBlock::iterator MII = I; const DebugLoc &DL = I.getDebugLoc(); const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg()); const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); + const TargetRegisterClass *Src0SubRC = TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1); @@ -545,34 +570,38 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { MachineInstr *CopySGPR2 = copyInstrs[1]; bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, - CopySGPR2->getOperand(0), isVReg64); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI); + SmallVector unpackedInstrs = + insertUnpackedMI(I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, + CopySGPR2->getOperand(0), isVReg64); + unpackedInstrs[0]->addRegisterKilled( + unpackedInstrs[0]->getOperand(2).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled( + unpackedInstrs[1]->getOperand(2).getReg(), TRI); return; - } - else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) { + } else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) { SmallVector copyInstrs = copyToVregAndInsertMI(I, 2); MachineInstr *CopySGPR1 = copyInstrs[0]; MachineInstr *CopySGPR2 = copyInstrs[1]; bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64); - unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI); + SmallVector unpackedInstrs = + insertUnpackedMI(I, DstMO, CopySGPR1->getOperand(0), SrcMO2, + CopySGPR2->getOperand(0), SrcMO2, isVReg64); + unpackedInstrs[0]->addRegisterKilled( + unpackedInstrs[0]->getOperand(1).getReg(), TRI); + unpackedInstrs[1]->addRegisterKilled( + unpackedInstrs[1]->getOperand(1).getReg(), TRI); return; } bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = insertUnpackedMI( - I, DstMO, SrcMO1, SrcMO2, SrcMO1, - SrcMO2, isVReg64); + SmallVector unpackedInstrs = + insertUnpackedMI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, isVReg64); return; } -void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) { +void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, + uint16_t AvailableBudget) { MachineBasicBlock &MBB = *I.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -585,7 +614,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av Register SrcReg1 = SrcMO1.getReg(); const DebugLoc &DL = 
I.getDebugLoc(); - + const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; auto SchedModel = TII->getSchedModel(); @@ -595,24 +624,25 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av auto BuildImm = [&](uint32_t Val) -> std::pair { Register ImmReg = MRI.createVirtualRegister(RC); auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) - .addImm(Val); + .addImm(Val); LIS->InsertMachineInstrInMaps(*newMI); - const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI); - uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(newMI); + uint16_t LatencyCycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; return {ImmReg, LatencyCycles}; }; - if (!IsF16MaskSet) { std::pair RegAndLatency = BuildImm(0x0000FFFF); - MaskLo = RegAndLatency.first; //mask for lower 16 bits + MaskLo = RegAndLatency.first; // mask for lower 16 bits AddlCyclesConsumed += RegAndLatency.second; RegAndLatency = BuildImm(16); - ShiftAmt = RegAndLatency.first; //mask for higher 16 bits + ShiftAmt = RegAndLatency.first; // mask for higher 16 bits AddlCyclesConsumed += RegAndLatency.second; IsF16MaskSet = true; } - + Register Src0_Lo = MRI.createVirtualRegister(RC); Register Src1_Lo = MRI.createVirtualRegister(RC); Register Src0_Hi = MRI.createVirtualRegister(RC); @@ -624,27 +654,33 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av if (SrcMO0.getSubReg()) SubRegID = SrcMO0.getSubReg(); - int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + int src0_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int src1_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); - int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int clampIdx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); int64_t clampVal = I.getOperand(clampIdx).getImm(); // handle op_sel for src0 if (src0_Mods & SISrcMods::OP_SEL_0) { - // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo) - .addReg(ShiftAmt); + // if op_sel is set, select higher 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder LoInput0_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo) + .addReg(ShiftAmt); if (SubRegID) LoInput0_MI.addReg(SrcReg0, 0, SubRegID); else LoInput0_MI.addReg(SrcReg0); LIS->InsertMachineInstrInMaps(*LoInput0_MI); - } - else { - // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo); + } else { + // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder LoInput0_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo); if (SubRegID) LoInput0_MI.addReg(SrcReg0, 0, SubRegID); else @@ -655,18 +691,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, 
uint16_t Av // handle op_sel_hi for src0 if (src0_Mods & SISrcMods::OP_SEL_1) { - // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi) - .addReg(ShiftAmt); + // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder HiInput0_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi) + .addReg(ShiftAmt); if (SubRegID) HiInput0_MI.addReg(SrcReg0, 0, SubRegID); else HiInput0_MI.addReg(SrcReg0); LIS->InsertMachineInstrInMaps(*HiInput0_MI); - } - else { - // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi); + } else { + // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits + // of new vgpr + MachineInstrBuilder HiInput0_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi); if (SubRegID) HiInput0_MI.addReg(SrcReg0, 0, SubRegID); else @@ -680,18 +719,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av SubRegID = SrcMO1.getSubReg(); // handle op_sel for src1 if (src1_Mods & SISrcMods::OP_SEL_0) { - // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo) - .addReg(ShiftAmt); + // if op_sel is set, select higher 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder LoInput1_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo) + .addReg(ShiftAmt); if (SubRegID) LoInput1_MI.addReg(SrcReg1, 0, SubRegID); else LoInput1_MI.addReg(SrcReg1); LIS->InsertMachineInstrInMaps(*LoInput1_MI); - } - else { - // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo); + } else { + // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder LoInput1_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo); if (SubRegID) LoInput1_MI.addReg(SrcReg1, 0, SubRegID); else @@ -702,18 +744,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av // handle op_sel_hi for src1 if (src1_Mods & SISrcMods::OP_SEL_1) { - // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi) - .addReg(ShiftAmt); + // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of + // new vgpr + MachineInstrBuilder HiInput1_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi) + .addReg(ShiftAmt); if (SubRegID) HiInput1_MI.addReg(SrcReg1, 0, SubRegID); else HiInput1_MI.addReg(SrcReg1); LIS->InsertMachineInstrInMaps(*HiInput1_MI); - } - else { - // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr - MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi); + } else { + // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits + // of new vgpr + MachineInstrBuilder HiInput1_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi); if (SubRegID) HiInput1_MI.addReg(SrcReg1, 0, SubRegID); else @@ 
-728,75 +773,81 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; uint16_t unpackedOpcode = mapToUnpackedOpcode(I); - + // Unpacked instructions - MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul); + MachineInstrBuilder LoMul_MI = + BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul); - if (src0_Mods & SISrcMods::NEG) + if (src0_Mods & SISrcMods::NEG) Lo_src0_mods |= SISrcMods::NEG; - LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers - LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0 + LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers + LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0 if (src1_Mods & SISrcMods::NEG) Lo_src1_mods |= SISrcMods::NEG; - LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers - LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1 - LoMul_MI.addImm(clampVal); //clamp - //packed instructions do not support output modifiers. safe to assign them 0 for this use case - LoMul_MI.addImm(0); //omod + LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers + LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1 + LoMul_MI.addImm(clampVal); // clamp + // packed instructions do not support output modifiers. safe to assign them 0 + // for this use case + LoMul_MI.addImm(0); // omod - // unpacked instruction with VOP3 encoding for Hi bits + // unpacked instruction with VOP3 encoding for Hi bits unsigned Hi_src0_mods = 0; unsigned Hi_src1_mods = 0; - MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul); - if (src0_Mods & SISrcMods::NEG_HI) + MachineInstrBuilder HiMul_MI = + BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul); + if (src0_Mods & SISrcMods::NEG_HI) Hi_src0_mods |= SISrcMods::NEG_HI; - - HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers - HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set + + HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers + HiMul_MI.addReg(Src0_Hi, + RegState::Kill); // select higher 16 bits if op_sel_hi is set if (src1_Mods & SISrcMods::NEG_HI) Hi_src1_mods |= SISrcMods::NEG_HI; - - HiMul_MI.addImm(Hi_src1_mods); //src0_modifiers - HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set - HiMul_MI.addImm(clampVal); //clamp - //packed instructions do not support output modifiers. safe to assign them 0 for this use case - HiMul_MI.addImm(0); //omod + + HiMul_MI.addImm(Hi_src1_mods); // src0_modifiers + HiMul_MI.addReg( + Src1_Hi, + RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set + HiMul_MI.addImm(clampVal); // clamp + // packed instructions do not support output modifiers. 
safe to assign them 0 + // for this use case + HiMul_MI.addImm(0); // omod // Shift HiMul left by 16 Register HiMulShifted = MRI.createVirtualRegister(RC); - MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted) - .addReg(ShiftAmt) - .addReg(HiMul); + MachineInstrBuilder HiMulShifted_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted) + .addReg(ShiftAmt) + .addReg(HiMul); SubRegID = 0; if (DstMO.getSubReg()) SubRegID = DstMO.getSubReg(); // OR LoMul | (HiMul << 16) - MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64)); + MachineInstrBuilder RewriteBackToDst_MI = + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64)); if (SubRegID) { - if (DstMO.isUndef()){ + if (DstMO.isUndef()) { RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID); - } - else { + } else { RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID); } - } - else { - if (DstMO.isUndef()){ + } else { + if (DstMO.isUndef()) { RewriteBackToDst_MI.addDef(DstReg, RegState::Undef); - } - else { + } else { RewriteBackToDst_MI.addDef(DstReg); } } RewriteBackToDst_MI.addReg(LoMul); RewriteBackToDst_MI.addReg(HiMulShifted); - + LIS->InsertMachineInstrInMaps(*LoMul_MI); LIS->InsertMachineInstrInMaps(*HiMul_MI); LIS->InsertMachineInstrInMaps(*HiMulShifted_MI); @@ -805,7 +856,6 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av I.eraseFromParent(); LIS->removeInterval(DstReg); LIS->createAndComputeVirtRegInterval(DstReg); - } bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { @@ -844,19 +894,22 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { } // Add RA hints to improve True16 COPY elimination. - // Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA + // Unpack packed instructions to overlap MFMAs. 
This allows the compiler to + // co-issue unpacked instructions with MFMA for (MachineBasicBlock &MBB : MF) { SetVector instrsToUnpack; IsF16MaskSet = false; uint16_t NumMFMACycles = 0; auto SchedModel = TII->getSchedModel(); for (MachineInstr &MI : MBB) { - if (SIInstrInfo::isMFMA(MI)){ - const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI); - NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + if (SIInstrInfo::isMFMA(MI)) { + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles); } - if (ST.useRealTrue16Insts()){ + if (ST.useRealTrue16Insts()) { if (MI.getOpcode() != AMDGPU::COPY) continue; Register Dst = MI.getOperand(0).getReg(); @@ -883,13 +936,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } } - + if (!instrsToUnpack.empty()) { for (MachineInstr *MI : instrsToUnpack) { - if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) { + if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || + (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) { processF16Unpacking(*MI, NumMFMACycles); - } - else { + } else { insertMI(*MI); } } diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir new file mode 100644 index 0000000000000..b13f61a963ed5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir @@ -0,0 +1,209 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: test_pk_mul_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + + ; GCN-LABEL: name: test_pk_mul_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0 + ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0 + ; GCN-NEXT: KILL %1.sub6_sub7 + ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]] + ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6 + ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7 + ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4 + ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5 + ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, 
%5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %3:sgpr_64(p4) = COPY $sgpr4_sgpr5 + early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0 + %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0 + %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0 + early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0 + KILL %8.sub6_sub7 + early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0 + %57:vreg_128_align2 = COPY %22 + %58:vreg_128_align2 = COPY %23 + undef %69.sub0:vreg_64_align2 = COPY %39.sub6 + %69.sub1:vreg_64_align2 = COPY %39.sub7 + undef %75.sub0:vreg_64_align2 = COPY %39.sub4 + undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec + %75.sub1:vreg_64_align2 = COPY %39.sub5 + %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_op_sel_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + + ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0 + ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0 + ; GCN-NEXT: KILL %1.sub6_sub7 + ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]] + ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6 + ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7 + ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4 + ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5 + ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %3:sgpr_64(p4) = COPY $sgpr4_sgpr5 + early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0 + %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0 + %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0 + early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0 + KILL %8.sub6_sub7 + early-clobber %24:sgpr_512 
= S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0 + %57:vreg_128_align2 = COPY %22 + %58:vreg_128_align2 = COPY %23 + undef %69.sub0:vreg_64_align2 = COPY %39.sub6 + %69.sub1:vreg_64_align2 = COPY %39.sub7 + undef %75.sub0:vreg_64_align2 = COPY %39.sub4 + undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec + %75.sub1:vreg_64_align2 = COPY %39.sub5 + %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 12, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_op_sel_hi_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0 + ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0 + ; GCN-NEXT: KILL %1.sub6_sub7 + ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]] + ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6 + ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7 + ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4 + ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5 + ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %3:sgpr_64(p4) = COPY $sgpr4_sgpr5 + early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0 + %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0 + %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0 + early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0 + KILL %8.sub6_sub7 + early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0 + %57:vreg_128_align2 = COPY %22 + %58:vreg_128_align2 = COPY %23 + undef %69.sub0:vreg_64_align2 = COPY %39.sub6 + %69.sub1:vreg_64_align2 = COPY %39.sub7 + undef %75.sub0:vreg_64_align2 = COPY %39.sub4 + undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec + %75.sub1:vreg_64_align2 = COPY %39.sub5 + %179.sub0_sub1:vreg_128_align2 = nofpexcept 
V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_only_overlapped_unpacking_f16 +tracksRegLiveness: true +liveins: + - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' } +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_only_overlapped_unpacking_f16 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub6_sub7, 0, 0 + ; GCN-NEXT: early-clobber %4:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub6_sub7, 0, 0 + ; GCN-NEXT: dead [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GCN-NEXT: early-clobber %6:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub4_sub5, 0, 0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub7 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %4.sub6 + ; GCN-NEXT: undef [[V_PK_MUL_F16_:%[0-9]+]].sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead early-clobber %12:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 %6.sub6, [[V_MOV_B32_e32_]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], %6.sub6, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[COPY4]], [[V_MOV_B32_e32_]], implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_AND_B32_e32_]], 0, killed [[V_AND_B32_e32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_LSHRREV_B32_e64_]], 0, killed [[V_LSHRREV_B32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_MUL_F16_e64_1]], implicit $exec + ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]].sub2:vreg_128_align2 = V_OR_B32_e64 [[V_MUL_F16_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %4.sub5 + ; GCN-NEXT: dead [[V_PK_MUL_F16_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub5, 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %3:sgpr_64(p4) = COPY $sgpr4_sgpr5 + early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0 + %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub4_sub5, 0, 0 + %23:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub6_sub7, 0, 0 + early-clobber %25:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub6_sub7, 0, 0 + %12:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + early-clobber %24:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub4_sub5, 0, 0 + %29:vreg_64_align2 = COPY %22 + %30:vreg_64_align2 = COPY %23 + 
%51:vgpr_32 = COPY %25.sub7 + %55:vgpr_32 = COPY %25.sub6 + undef %99.sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub7, 8, %51, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %28:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 %29, %30, 0, 0, 0, 0, implicit $mode, implicit $exec + %99.sub2:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub6, 8, %55, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %59:vgpr_32 = COPY %25.sub5 + %99.sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub5, 8, %59, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 From 7acafc48136cd22d64d3c03b643075e070c97754 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Mon, 18 Aug 2025 15:21:58 -0500 Subject: [PATCH 10/16] adding gfx942 supports and code cleanup --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 199 ++++++------------ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 +- 2 files changed, 68 insertions(+), 137 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 9a2f898dcb2de..6ec71324df84e 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -65,15 +65,14 @@ class GCNPreRAOptimizationsImpl { bool processReg(Register Reg); bool createListOfPackedInstr(MachineInstr &BeginMI, - SetVector &instrsToUnpack, + SetVector &InstrsToUnpack, uint16_t NumMFMACycles); bool isUnpackingSupportedInstr(MachineInstr &MI) const; - void insertMI(MachineInstr &I); + void processF32Unpacking(MachineInstr &I); uint16_t mapToUnpackedOpcode(MachineInstr &I); - SmallVector copyToVregAndInsertMI(MachineInstr &I, - unsigned SGPRSrcPos); + SmallVector - insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, + insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64); @@ -288,43 +287,8 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { } } -SmallVector -GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I, - unsigned SGPRSrcPos) { - SmallVector MIList; - - MachineBasicBlock &MBB = *I.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineFunction &MF = *MBB.getParent(); - const DebugLoc &DL = I.getDebugLoc(); - - Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); - MachineInstr *CopySGPR1 = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY)) - .addDef(TmpReg, RegState::Undef) - .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0); - unsigned SubIdx = TRI->composeSubRegIndices( - AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg()); - CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg()); - CopySGPR1->getOperand(0).setSubReg(SubIdx); - LIS->InsertMachineInstrInMaps(*CopySGPR1); - MIList.push_back(CopySGPR1); - - MachineInstr *CopySGPR2 = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY)) - .addDef(TmpReg) - .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1); - SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1, - CopySGPR2->getOperand(0).getSubReg()); - CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg()); - CopySGPR2->getOperand(0).setSubReg(SubIdx); - LIS->InsertMachineInstrInMaps(*CopySGPR2); - MIList.push_back(CopySGPR2); - return MIList; -} - bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( - MachineInstr &BeginMI, SetVector &instrsToUnpack, + MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles) { auto *BB = 
BeginMI.getParent(); auto *MF = BB->getParent(); @@ -336,10 +300,10 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( auto SchedModel = TII->getSchedModel(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; - const MCSchedClassDesc *instrSchedClassDesc = + const MCSchedClassDesc *InstrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); TotalCyclesBetweenCandidates += - SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle; + SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; if (Instr.isMetaInstruction()) continue; @@ -389,16 +353,16 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( TotalCyclesBetweenCandidates += 1; if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) - instrsToUnpack.insert(&Instr); + InstrsToUnpack.insert(&Instr); } } return true; } -SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( +SmallVector GCNPreRAOptimizationsImpl::insertUnpackedF32MI( MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, - MachineOperand &HiSrcMO2, bool isVreg_64) { + MachineOperand &HiSrcMO2, bool IsVreg_64) { SmallVector MIList; MachineBasicBlock &MBB = *I.getParent(); @@ -414,28 +378,27 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( unsigned DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); - const MCInstrDesc instrDesc = I.getDesc(); + const MCInstrDesc InstrDesc = I.getDesc(); - int clampIdx = + int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); - int64_t clampVal = I.getOperand(clampIdx).getImm(); + int64_t ClampVal = I.getOperand(ClampIdx).getImm(); - int src0_modifiers_Idx = + int Src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int src1_modifiers_Idx = + int Src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); - unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); + unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); - // don't worry about abs values. Packed instructions (VOP3P) do not support - // them + // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. 
unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; - uint16_t unpackedOpcode = mapToUnpackedOpcode(I); - MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); + uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst - if (src0_Mods & SISrcMods::OP_SEL_0) { - if (src0_Mods & SISrcMods::NEG) { + if (Src0_Mods & SISrcMods::OP_SEL_0) { + if (Src0_Mods & SISrcMods::NEG) { Lo_src0_mods |= SISrcMods::NEG; } Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers @@ -450,8 +413,8 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( Src0SubIdx); // src0 //if op_sel == 0, select register 0 of // reg:sub0_sub1 } - if (src1_Mods & SISrcMods::OP_SEL_0) { - if (src1_Mods & SISrcMods::NEG) { + if (Src1_Mods & SISrcMods::OP_SEL_0) { + if (Src1_Mods & SISrcMods::NEG) { Lo_src1_mods |= SISrcMods::NEG; } Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers @@ -466,12 +429,12 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 // of reg:sub0_sub1 } - Op0L_Op1L.addImm(clampVal); // clamp + Op0L_Op1L.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. safe to assign them 0 // for this use case Op0L_Op1L.addImm(0); // omod - if (isVreg_64) { + if (IsVreg_64) { Op0L_Op1L->getOperand(0).setIsUndef(); } else if (I.getOperand(0).isUndef()) { Op0L_Op1L->getOperand(0).setIsUndef(); @@ -483,15 +446,14 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); - // don't worry about abs values. Packed instructions (VOP3P) do not support - // them + // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. unsigned Hi_src0_mods = 0; unsigned Hi_src1_mods = 0; - MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode)); + MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst - if (src0_Mods & SISrcMods::OP_SEL_1) { - if (src0_Mods & SISrcMods::NEG_HI) { + if (Src0_Mods & SISrcMods::OP_SEL_1) { + if (Src0_Mods & SISrcMods::NEG_HI) { Hi_src0_mods |= SISrcMods::NEG; } Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers @@ -507,8 +469,8 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( // of reg:sub0_sub1 } - if (src1_Mods & SISrcMods::OP_SEL_1) { - if (src1_Mods & SISrcMods::NEG_HI) { + if (Src1_Mods & SISrcMods::OP_SEL_1) { + if (Src1_Mods & SISrcMods::NEG_HI) { Hi_src1_mods |= SISrcMods::NEG; } Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers @@ -523,7 +485,7 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 // of reg:sub0_sub1 } - Op0H_Op1H.addImm(clampVal); // clamp + Op0H_Op1H.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. 
safe to assign them 0 // for this use case Op0H_Op1H.addImm(0); // omod @@ -542,7 +504,7 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedMI( return MIList; } -void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { +void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction &MF = *MBB.getParent(); @@ -563,40 +525,9 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) { TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1); - if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) { - // try with sgpr32 - SmallVector copyInstrs = copyToVregAndInsertMI(I, 4); - MachineInstr *CopySGPR1 = copyInstrs[0]; - MachineInstr *CopySGPR2 = copyInstrs[1]; - - bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = - insertUnpackedMI(I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1, - CopySGPR2->getOperand(0), isVReg64); - unpackedInstrs[0]->addRegisterKilled( - unpackedInstrs[0]->getOperand(2).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled( - unpackedInstrs[1]->getOperand(2).getReg(), TRI); - return; - } else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) { - SmallVector copyInstrs = copyToVregAndInsertMI(I, 2); - MachineInstr *CopySGPR1 = copyInstrs[0]; - MachineInstr *CopySGPR2 = copyInstrs[1]; - - bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = - insertUnpackedMI(I, DstMO, CopySGPR1->getOperand(0), SrcMO2, - CopySGPR2->getOperand(0), SrcMO2, isVReg64); - unpackedInstrs[0]->addRegisterKilled( - unpackedInstrs[0]->getOperand(1).getReg(), TRI); - unpackedInstrs[1]->addRegisterKilled( - unpackedInstrs[1]->getOperand(1).getReg(), TRI); - return; - } - - bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector unpackedInstrs = - insertUnpackedMI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, isVReg64); + bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); + SmallVector UnpackedInstrs = + insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); return; } @@ -623,11 +554,11 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, auto BuildImm = [&](uint32_t Val) -> std::pair { Register ImmReg = MRI.createVirtualRegister(RC); - auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) + auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) .addImm(Val); - LIS->InsertMachineInstrInMaps(*newMI); + LIS->InsertMachineInstrInMaps(*NewMI); const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(newMI); + SchedModel.resolveSchedClass(NewMI); uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; return {ImmReg, LatencyCycles}; @@ -636,10 +567,8 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, if (!IsF16MaskSet) { std::pair RegAndLatency = BuildImm(0x0000FFFF); MaskLo = RegAndLatency.first; // mask for lower 16 bits - AddlCyclesConsumed += RegAndLatency.second; RegAndLatency = BuildImm(16); ShiftAmt = RegAndLatency.first; // mask for higher 16 bits - AddlCyclesConsumed += RegAndLatency.second; IsF16MaskSet = true; } @@ -654,18 +583,18 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, if (SrcMO0.getSubReg()) SubRegID = SrcMO0.getSubReg(); - int src0_modifiers_Idx = + int Src0_modifiers_Idx = 
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int src1_modifiers_Idx = + int Src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm(); - unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm(); - int clampIdx = + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); + unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); + int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); - int64_t clampVal = I.getOperand(clampIdx).getImm(); + int64_t ClampVal = I.getOperand(ClampIdx).getImm(); // handle op_sel for src0 - if (src0_Mods & SISrcMods::OP_SEL_0) { + if (Src0_Mods & SISrcMods::OP_SEL_0) { // if op_sel is set, select higher 16 bits and copy into lower 16 bits of // new vgpr MachineInstrBuilder LoInput0_MI = @@ -690,7 +619,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, } // handle op_sel_hi for src0 - if (src0_Mods & SISrcMods::OP_SEL_1) { + if (Src0_Mods & SISrcMods::OP_SEL_1) { // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of // new vgpr MachineInstrBuilder HiInput0_MI = @@ -718,7 +647,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, if (SrcMO0.getSubReg()) SubRegID = SrcMO1.getSubReg(); // handle op_sel for src1 - if (src1_Mods & SISrcMods::OP_SEL_0) { + if (Src1_Mods & SISrcMods::OP_SEL_0) { // if op_sel is set, select higher 16 bits and copy into lower 16 bits of // new vgpr MachineInstrBuilder LoInput1_MI = @@ -743,7 +672,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, } // handle op_sel_hi for src1 - if (src1_Mods & SISrcMods::OP_SEL_1) { + if (Src1_Mods & SISrcMods::OP_SEL_1) { // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of // new vgpr MachineInstrBuilder HiInput1_MI = @@ -772,24 +701,24 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; - uint16_t unpackedOpcode = mapToUnpackedOpcode(I); + uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); // Unpacked instructions MachineInstrBuilder LoMul_MI = - BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul); + BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul); - if (src0_Mods & SISrcMods::NEG) + if (Src0_Mods & SISrcMods::NEG) Lo_src0_mods |= SISrcMods::NEG; LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0 - if (src1_Mods & SISrcMods::NEG) + if (Src1_Mods & SISrcMods::NEG) Lo_src1_mods |= SISrcMods::NEG; LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1 - LoMul_MI.addImm(clampVal); // clamp + LoMul_MI.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. 
safe to assign them 0 // for this use case LoMul_MI.addImm(0); // omod @@ -799,22 +728,22 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, unsigned Hi_src1_mods = 0; MachineInstrBuilder HiMul_MI = - BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul); - if (src0_Mods & SISrcMods::NEG_HI) + BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), HiMul); + if (Src0_Mods & SISrcMods::NEG_HI) Hi_src0_mods |= SISrcMods::NEG_HI; HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers HiMul_MI.addReg(Src0_Hi, RegState::Kill); // select higher 16 bits if op_sel_hi is set - if (src1_Mods & SISrcMods::NEG_HI) + if (Src1_Mods & SISrcMods::NEG_HI) Hi_src1_mods |= SISrcMods::NEG_HI; HiMul_MI.addImm(Hi_src1_mods); // src0_modifiers HiMul_MI.addReg( Src1_Hi, RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set - HiMul_MI.addImm(clampVal); // clamp + HiMul_MI.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. safe to assign them 0 // for this use case HiMul_MI.addImm(0); // omod @@ -897,7 +826,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // Unpack packed instructions to overlap MFMAs. This allows the compiler to // co-issue unpacked instructions with MFMA for (MachineBasicBlock &MBB : MF) { - SetVector instrsToUnpack; + SetVector InstrsToUnpack; IsF16MaskSet = false; uint16_t NumMFMACycles = 0; auto SchedModel = TII->getSchedModel(); @@ -907,7 +836,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { SchedModel.resolveSchedClass(&MI); NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles); + createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles); } if (ST.useRealTrue16Insts()) { if (MI.getOpcode() != AMDGPU::COPY) @@ -937,13 +866,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { } } - if (!instrsToUnpack.empty()) { - for (MachineInstr *MI : instrsToUnpack) { + if (!InstrsToUnpack.empty()) { + for (MachineInstr *MI : InstrsToUnpack) { if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) { processF16Unpacking(*MI, NumMFMACycles); } else { - insertMI(*MI); + processF32Unpacking(*MI); } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5562ff590b71d..1f7cd0140b32c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6174,9 +6174,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { bool IsGFX950Only = ST.hasGFX950Insts(); - if (!IsGFX950Only) + bool IsGFX940Only = ST.hasGFX940Insts(); + + if (!IsGFX950Only && !IsGFX940Only) return false; - + if (!isVALU(MI)) return false; From cf4cb9e17a7ecc657528dfb11d4fef2d1bd60b87 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 21 Aug 2025 12:18:26 -0500 Subject: [PATCH 11/16] adding pk_fma_f32 support and more code cleanup --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 305 ++++++++++++++---- 1 file changed, 239 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 6ec71324df84e..8721fc7ec3afc 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -28,16 +28,16 @@ /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of /// the VGPR_32, the COPY can be completely eliminated. 
/// -/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and -/// V_PK_ADD_F32) adjacent to MFMAs such that they can be co-issued. This helps -/// with overlapping MFMA and certain vector instructions in machine schedules -/// and is expected to improve performance. -/// Only those packed instructions are unpacked that are overlapped by the MFMA -/// latency. Rest should remain untouched. +/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16, +/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be +/// co-issued. This helps with overlapping MFMA and certain vector instructions +/// in machine schedules and is expected to improve performance. Only those +/// packed instructions are unpacked that are overlapped by the MFMA latency. +/// Rest should remain untouched. //===----------------------------------------------------------------------===// -#include "AMDGPU.h" #include "GCNPreRAOptimizations.h" +#include "AMDGPU.h" #include "GCNSchedStrategy.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -70,13 +70,14 @@ class GCNPreRAOptimizationsImpl { bool isUnpackingSupportedInstr(MachineInstr &MI) const; void processF32Unpacking(MachineInstr &I); uint16_t mapToUnpackedOpcode(MachineInstr &I); - - SmallVector - insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO, - MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, - MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, - bool isVreg_64); + + void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO, + MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, + MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, + bool isVreg_64); void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget); + void processFMAF32Unpacking(MachineInstr &I); + bool IsF16MaskSet; Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions Register @@ -261,6 +262,7 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( case AMDGPU::V_PK_MUL_F32: case AMDGPU::V_PK_MUL_F16: case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_FMA_F32: return true; default: @@ -282,6 +284,8 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { return AMDGPU::V_ADD_F16_e64; case AMDGPU::V_PK_MUL_F16: return AMDGPU::V_MUL_F16_e64; + case AMDGPU::V_PK_FMA_F32: + return AMDGPU::V_FMA_F32_e64; default: return std::numeric_limits::max(); } @@ -359,12 +363,11 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( return true; } -SmallVector GCNPreRAOptimizationsImpl::insertUnpackedF32MI( +void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool IsVreg_64) { - SmallVector MIList; MachineBasicBlock &MBB = *I.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction &MF = *MBB.getParent(); @@ -395,53 +398,49 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedF32MI( unsigned Lo_src0_mods = 0; unsigned Lo_src1_mods = 0; uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits::max()) + return; + MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst + if (Src0_Mods & SISrcMods::NEG) { + Lo_src0_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers if (Src0_Mods & SISrcMods::OP_SEL_0) { - if (Src0_Mods & SISrcMods::NEG) { - 
Lo_src0_mods |= SISrcMods::NEG; - } - Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0 } else { - Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0 //if op_sel == 0, select register 0 of // reg:sub0_sub1 } + if (Src1_Mods & SISrcMods::NEG) { + Lo_src1_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers if (Src1_Mods & SISrcMods::OP_SEL_0) { - if (Src1_Mods & SISrcMods::NEG) { - Lo_src1_mods |= SISrcMods::NEG; - } - Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src0 } else { - Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); - Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, - Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 - // of reg:sub0_sub1 + // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); } Op0L_Op1L.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. safe to assign them 0 // for this use case Op0L_Op1L.addImm(0); // omod - if (IsVreg_64) { - Op0L_Op1L->getOperand(0).setIsUndef(); - } else if (I.getOperand(0).isUndef()) { + if (I.getOperand(0).isUndef()) { Op0L_Op1L->getOperand(0).setIsUndef(); } - LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); @@ -449,42 +448,225 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedF32MI( // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. 
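The lo/hi handling above and below can be summarised with a small standalone model of the intended two-lane semantics: each packed F32 operand is a {sub0, sub1} pair, OP_SEL_0/OP_SEL_1 choose which half the lo/hi unpacked instruction reads, and NEG/NEG_HI negate that read. This is only an illustration, not LLVM code; the MOD_* constants, VRegPair type, and function names are invented for the sketch and do not match the real SISrcMods encoding.

#include <array>
#include <cstdio>

// Illustrative stand-ins for the modifier bits; real values live in SISrcMods.
enum : unsigned {
  MOD_NEG = 1u << 0,      // negate the source read by the lo instruction
  MOD_NEG_HI = 1u << 1,   // negate the source read by the hi instruction
  MOD_OP_SEL_0 = 1u << 2, // lo instruction reads sub1 instead of sub0
  MOD_OP_SEL_1 = 1u << 3  // hi instruction reads sub1 instead of sub0
};

// A 64-bit register pair holding two packed f32 lanes: {sub0, sub1}.
using VRegPair = std::array<float, 2>;

// Which 32-bit half one of the two unpacked VOP3 instructions reads, and
// whether it is negated, mirroring the checks in the pass.
static float readLane(const VRegPair &Src, unsigned Mods, bool HiHalf) {
  unsigned OpSel = HiHalf ? MOD_OP_SEL_1 : MOD_OP_SEL_0;
  unsigned Neg = HiHalf ? MOD_NEG_HI : MOD_NEG;
  float V = (Mods & OpSel) ? Src[1] : Src[0];
  return (Mods & Neg) ? -V : V;
}

// V_PK_MUL_F32 behaves like two independent V_MUL_F32 ops, one per dst half.
static VRegPair pkMulF32(VRegPair A, unsigned AMods, VRegPair B, unsigned BMods) {
  return {readLane(A, AMods, false) * readLane(B, BMods, false),  // dst.sub0
          readLane(A, AMods, true) * readLane(B, BMods, true)};   // dst.sub1
}

int main() {
  VRegPair A{1.5f, 2.0f}, B{4.0f, -3.0f};
  // op_sel_hi set on both sources: each half multiplies its own lane.
  VRegPair D = pkMulF32(A, MOD_OP_SEL_1, B, MOD_OP_SEL_1);
  std::printf("dst.sub0 = %g, dst.sub1 = %g\n", D[0], D[1]); // 6, -6
  return 0;
}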
unsigned Hi_src0_mods = 0; unsigned Hi_src1_mods = 0; - MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst + if (Src0_Mods & SISrcMods::NEG_HI) { + Hi_src0_mods |= SISrcMods::NEG_HI; + } + Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers if (Src0_Mods & SISrcMods::OP_SEL_1) { - if (Src0_Mods & SISrcMods::NEG_HI) { - Hi_src0_mods |= SISrcMods::NEG; - } - Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1); Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0 } else { - Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0); - Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, - Src0SubIdx); // src0 //if op_sel_hi == 0, select register 0 - // of reg:sub0_sub1 + // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); } - + if (Src1_Mods & SISrcMods::NEG_HI) { + Hi_src1_mods |= SISrcMods::NEG_HI; + } + Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers if (Src1_Mods & SISrcMods::OP_SEL_1) { - if (Src1_Mods & SISrcMods::NEG_HI) { - Hi_src1_mods |= SISrcMods::NEG; - } - Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1); Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src0 } else { - Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0); - Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, + // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); + } + Op0H_Op1H.addImm(ClampVal); // clamp + // packed instructions do not support output modifiers. 
safe to assign them 0 + // for this use case + Op0H_Op1H.addImm(0); // omod + LIS->InsertMachineInstrInMaps(*Op0H_Op1H); + + if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); + } + LIS->RemoveMachineInstrFromMaps(I); + I.eraseFromParent(); + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + return; +} + +void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { + MachineBasicBlock &MBB = *I.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg1 = I.getOperand(2).getReg(); + Register SrcReg2 = I.getOperand(4).getReg(); + Register SrcReg3 = I.getOperand(6).getReg(); + MachineOperand &DstMO = I.getOperand(0); + MachineOperand &SrcMO1 = I.getOperand(2); + MachineOperand &SrcMO2 = I.getOperand(4); + MachineOperand &SrcMO3 = I.getOperand(6); + + const DebugLoc &DL = I.getDebugLoc(); + const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg()); + const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); + const TargetRegisterClass *Src2RC = MRI.getRegClass(I.getOperand(6).getReg()); + + bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); + + // insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); + unsigned SrcSubIdx1 = + TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); + unsigned SrcSubIdx2 = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); + unsigned SrcSubIdx3 = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); + unsigned DestSubIdx = + TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); + + const MCInstrDesc InstrDesc = I.getDesc(); + int ClampIdx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int64_t ClampVal = I.getOperand(ClampIdx).getImm(); + int Src0_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int Src1_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + int Src2_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers); + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); + unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); + unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); + + // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. 
+ unsigned Lo_src0_mods = 0; + unsigned Lo_src1_mods = 0; + unsigned Lo_src2_mods = 0; + uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits::max()) + return; + + MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); + Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst + if (Src0_Mods & SISrcMods::NEG) { + Lo_src0_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers + if (Src0_Mods & SISrcMods::OP_SEL_0) { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 + } else { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); + // if op_sel == 0, select register 0 of reg:sub0_sub1 + Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); + } + + if (Src1_Mods & SISrcMods::NEG) { + Lo_src1_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers + if (Src1_Mods & SISrcMods::OP_SEL_0) { + unsigned Src1SubIdx = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 + } else { + unsigned Src1SubIdx = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); + Op0L_Op1L.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 // of reg:sub0_sub1 } + + if (Src2_Mods & SISrcMods::NEG) { + Lo_src2_mods |= SISrcMods::NEG; + } + Op0L_Op1L.addImm(Lo_src2_mods); // src2_modifiers + if (Src2_Mods & SISrcMods::OP_SEL_0) { + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); + Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx); + } else { + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); + // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx); + } + Op0L_Op1L.addImm(ClampVal); // clamp + // packed instructions do not support output modifiers. safe to assign them 0 + // for this use case + Op0L_Op1L.addImm(0); // omod + + if (I.getOperand(0).isUndef()) { + Op0L_Op1L->getOperand(0).setIsUndef(); + } + + LIS->InsertMachineInstrInMaps(*Op0L_Op1L); + + SrcSubIdx1 = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); + SrcSubIdx2 = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); + SrcSubIdx3 = TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); + DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); + + // Packed instructions (VOP3P) do not support abs. It is safe to ignore them. 
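Before the high half is built below, the end-to-end effect of the lo/hi pair for the FMA case can be seen in a minimal sketch, assuming all NEG/NEG_HI/OP_SEL bits are clear (the modifier handling mirrors the MUL/ADD path). The types and names here are illustrative only, not part of the pass.

#include <array>
#include <cstdio>

// Two-lane model of V_PK_FMA_F32 with all source modifiers cleared: each
// 64-bit operand holds a {sub0, sub1} pair of f32 lanes, and the packed op
// behaves like two independent V_FMA_F32 instructions.
using VRegPair = std::array<float, 2>;

static VRegPair pkFmaF32(VRegPair A, VRegPair B, VRegPair C) {
  return {A[0] * B[0] + C[0],  // lo V_FMA_F32 writes dst.sub0
          A[1] * B[1] + C[1]}; // hi V_FMA_F32 writes dst.sub1
}

int main() {
  VRegPair A{1.0f, 2.0f}, B{3.0f, 4.0f}, C{0.5f, -0.5f};
  VRegPair D = pkFmaF32(A, B, C);
  std::printf("dst.sub0 = %g, dst.sub1 = %g\n", D[0], D[1]); // 3.5, 7.5
  return 0;
}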
+ unsigned Hi_src0_mods = 0; + unsigned Hi_src1_mods = 0; + unsigned Hi_src2_mods = 0; + + MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); + Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst + if (Src0_Mods & SISrcMods::NEG_HI) { + Hi_src0_mods |= SISrcMods::NEG_HI; + } + Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers + if (Src0_Mods & SISrcMods::OP_SEL_1) { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 + } else { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); + // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); + } + + if (Src1_Mods & SISrcMods::NEG_HI) { + Hi_src1_mods |= SISrcMods::NEG_HI; + } + Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers + + if (Src1_Mods & SISrcMods::OP_SEL_1) { + unsigned Src1SubIdx = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 + } else { + Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers + unsigned Src1SubIdx = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); + // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); + } + + if (Src2_Mods & SISrcMods::NEG_HI) { + Hi_src2_mods |= SISrcMods::NEG_HI; + } + Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers + + if (Src2_Mods & SISrcMods::OP_SEL_1) { + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); + Op0H_Op1H.addReg(SrcMO3.getReg(), 0, Src2SubIdx); // src0 + } else { + Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); + // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src2SubIdx); + } Op0H_Op1H.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. 
safe to assign them 0 // for this use case @@ -499,12 +681,14 @@ SmallVector GCNPreRAOptimizationsImpl::insertUnpackedF32MI( I.eraseFromParent(); LIS->removeInterval(DstReg); LIS->createAndComputeVirtRegInterval(DstReg); - MIList.push_back(Op0L_Op1L); - MIList.push_back(Op0H_Op1H); - return MIList; + return; } void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { + if (I.getOpcode() == AMDGPU::V_PK_FMA_F32) { + processFMAF32Unpacking(I); + return; + } MachineBasicBlock &MBB = *I.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction &MF = *MBB.getParent(); @@ -521,13 +705,8 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); - const TargetRegisterClass *Src0SubRC = - TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); - const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1); - bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - SmallVector UnpackedInstrs = - insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); + insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); return; } @@ -535,23 +714,17 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) { MachineBasicBlock &MBB = *I.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO0 = I.getOperand(2); MachineOperand &SrcMO1 = I.getOperand(4); - Register DstReg = DstMO.getReg(); Register SrcReg0 = SrcMO0.getReg(); Register SrcReg1 = SrcMO1.getReg(); - const DebugLoc &DL = I.getDebugLoc(); const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; auto SchedModel = TII->getSchedModel(); - uint16_t AddlCyclesConsumed = 0; - SetVector ListOfNewInstructions; - auto BuildImm = [&](uint32_t Val) -> std::pair { Register ImmReg = MRI.createVirtualRegister(RC); auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) @@ -576,8 +749,6 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, Register Src1_Lo = MRI.createVirtualRegister(RC); Register Src0_Hi = MRI.createVirtualRegister(RC); Register Src1_Hi = MRI.createVirtualRegister(RC); - Register Input0 = MRI.createVirtualRegister(RC); - Register Input1 = MRI.createVirtualRegister(RC); unsigned SubRegID = 0; if (SrcMO0.getSubReg()) @@ -703,6 +874,8 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, unsigned Lo_src1_mods = 0; uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits::max()) + return; // Unpacked instructions MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul); From a77dab90ade2b91ef036d9b072af17e0ca52e162 Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:57:14 -0500 Subject: [PATCH 12/16] fix incorrent merge --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3358f8ff1d73a..9b8fa25b88f11 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "GCNHazardRecognizer.h" +#include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include 
"Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" From c467ed5b5e89cc6715d7b3501af7bc3ce5c2ea43 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 27 Aug 2025 14:14:38 -0500 Subject: [PATCH 13/16] check dependencies with MFMA inst and code cleanup --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 402 ++++++------------ 1 file changed, 136 insertions(+), 266 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 8721fc7ec3afc..3bbed5a4d7e8a 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -77,6 +77,8 @@ class GCNPreRAOptimizationsImpl { bool isVreg_64); void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget); void processFMAF32Unpacking(MachineInstr &I); + MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA); + bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI); bool IsF16MaskSet; Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions @@ -262,9 +264,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( case AMDGPU::V_PK_MUL_F32: case AMDGPU::V_PK_MUL_F16: case AMDGPU::V_PK_ADD_F16: + return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg()); case AMDGPU::V_PK_FMA_F32: - return true; - + return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && MI.getOperand(6).isReg()); default: return false; } @@ -291,6 +293,22 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { } } +bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI) { + for (const MachineOperand &Pred_Ops: PredMI.operands()) { + if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) continue; + Register Pred_Reg = Pred_Ops.getReg(); + if (!Pred_Reg.isValid()) continue; + for (const MachineOperand &Succ_Ops: SuccMI.operands()) { + if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) continue; + Register Succ_Reg = Succ_Ops.getReg(); + if (!Succ_Reg.isValid()) continue; + if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) { + return true; + } + } + } + return false; +} bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles) { @@ -308,6 +326,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( SchedModel.resolveSchedClass(&Instr); TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; + if (Instr.isMetaInstruction()) continue; @@ -318,6 +337,9 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( return false; if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { + if (hasReadWriteDependencies(BeginMI, Instr)){ + dbgs() << "## here\n"; + } if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) { // unpacking packed F16 instructions requires multiple instructions. 
@@ -368,126 +390,72 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool IsVreg_64) { - MachineBasicBlock &MBB = *I.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineFunction &MF = *MBB.getParent(); + MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); Register DstReg = DstMO.getReg(); - unsigned SrcSubIdx1 = - TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); - unsigned SrcSubIdx2 = - TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); - unsigned DestSubIdx = - TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); - - const MCInstrDesc InstrDesc = I.getDesc(); - - int ClampIdx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); - int64_t ClampVal = I.getOperand(ClampIdx).getImm(); - - int Src0_modifiers_Idx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int Src1_modifiers_Idx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); - unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); - - // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. - unsigned Lo_src0_mods = 0; - unsigned Lo_src1_mods = 0; uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); if (UnpackedOpcode == std::numeric_limits::max()) return; - MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); - Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst - if (Src0_Mods & SISrcMods::NEG) { - Lo_src0_mods |= SISrcMods::NEG; - } - Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers - if (Src0_Mods & SISrcMods::OP_SEL_0) { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0 - } else { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0); - Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, - Src0SubIdx); // src0 //if op_sel == 0, select register 0 of - // reg:sub0_sub1 + MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false); + if (IsVreg_64) { + Op0L_Op1L->getOperand(0).setIsUndef(); + } else if (DstMO.isUndef()) { + Op0L_Op1L->getOperand(0).setIsUndef(); } - if (Src1_Mods & SISrcMods::NEG) { - Lo_src1_mods |= SISrcMods::NEG; + LIS->InsertMachineInstrInMaps(*Op0L_Op1L); + + MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false); + LIS->InsertMachineInstrInMaps(*Op0H_Op1H); + + if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); } - Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers - if (Src1_Mods & SISrcMods::OP_SEL_0) { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src0 - } else { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0); - // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); + if (I.getFlag(MachineInstr::MIFlag::FmContract)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract); } - Op0L_Op1L.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. 
safe to assign them 0 - // for this use case - Op0L_Op1L.addImm(0); // omod - if (I.getOperand(0).isUndef()) { + LIS->RemoveMachineInstrFromMaps(I); + I.eraseFromParent(); + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + return; +} + +void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { + MachineBasicBlock &MBB = *I.getParent(); + Register DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg()); + bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); + + uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits::max()) + return; + + MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true); + if (IsVReg64) + Op0L_Op1L->getOperand(0).setIsUndef(); + else if (I.getOperand(0).isUndef()) { Op0L_Op1L->getOperand(0).setIsUndef(); } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1); - SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1); - DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); - // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. - unsigned Hi_src0_mods = 0; - unsigned Hi_src1_mods = 0; - MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); - Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst - if (Src0_Mods & SISrcMods::NEG_HI) { - Hi_src0_mods |= SISrcMods::NEG_HI; - } - Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers - if (Src0_Mods & SISrcMods::OP_SEL_1) { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0 - } else { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0); - // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); - } - if (Src1_Mods & SISrcMods::NEG_HI) { - Hi_src1_mods |= SISrcMods::NEG_HI; - } - Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers - if (Src1_Mods & SISrcMods::OP_SEL_1) { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src0 - } else { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0); - // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); - } - Op0H_Op1H.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. 
safe to assign them 0 - // for this use case - Op0H_Op1H.addImm(0); // omod + MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true); LIS->InsertMachineInstrInMaps(*Op0H_Op1H); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); } + if (I.getFlag(MachineInstr::MIFlag::FmContract)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract); + } + LIS->RemoveMachineInstrFromMaps(I); I.eraseFromParent(); LIS->removeInterval(DstReg); @@ -495,39 +463,15 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( return; } -void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { - MachineBasicBlock &MBB = *I.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineFunction &MF = *MBB.getParent(); - - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg1 = I.getOperand(2).getReg(); - Register SrcReg2 = I.getOperand(4).getReg(); - Register SrcReg3 = I.getOperand(6).getReg(); +MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) { MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); - MachineOperand &SrcMO3 = I.getOperand(6); - - const DebugLoc &DL = I.getDebugLoc(); - const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg()); - const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); - const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); - const TargetRegisterClass *Src2RC = MRI.getRegClass(I.getOperand(6).getReg()); - - bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - - // insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); - unsigned SrcSubIdx1 = - TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); - unsigned SrcSubIdx2 = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - unsigned SrcSubIdx3 = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); - unsigned DestSubIdx = - TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); - - const MCInstrDesc InstrDesc = I.getDesc(); + Register DstReg = DstMO.getReg(); + Register SrcReg1 = SrcMO1.getReg(); + Register SrcReg2 = SrcMO2.getReg(); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg()); + unsigned DestSubIdx = isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); int64_t ClampVal = I.getOperand(ClampIdx).getImm(); @@ -535,153 +479,83 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); int Src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - int Src2_modifiers_Idx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers); + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); - unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); - // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. 
- unsigned Lo_src0_mods = 0; - unsigned Lo_src1_mods = 0; - unsigned Lo_src2_mods = 0; - uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - if (UnpackedOpcode == std::numeric_limits::max()) - return; - - MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); - Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst - if (Src0_Mods & SISrcMods::NEG) { - Lo_src0_mods |= SISrcMods::NEG; - } - Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers - if (Src0_Mods & SISrcMods::OP_SEL_0) { + unsigned New_Src0_Mods = 0; + unsigned New_Src1_Mods = 0; + + unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; + + MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); + NewMI.addDef(DstReg, 0, DestSubIdx); // vdst + if (Src0_Mods & NegModifier) { + New_Src0_Mods |= SISrcMods::NEG; + } + NewMI.addImm(New_Src0_Mods); // src0_modifiers + + if (Src0_Mods & OpSelModifier) { unsigned Src0SubIdx = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 + NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 } else { unsigned Src0SubIdx = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); // if op_sel == 0, select register 0 of reg:sub0_sub1 - Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); + NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); } - if (Src1_Mods & SISrcMods::NEG) { - Lo_src1_mods |= SISrcMods::NEG; + if (Src1_Mods & NegModifier) { + New_Src1_Mods |= SISrcMods::NEG; } - Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers - if (Src1_Mods & SISrcMods::OP_SEL_0) { + NewMI.addImm(New_Src1_Mods); // src1_modifiers + if (Src1_Mods & OpSelModifier) { unsigned Src1SubIdx = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 + NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 } else { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - Op0L_Op1L.addReg(SrcMO2.getReg(), 0, - Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0 - // of reg:sub0_sub1 - } - - if (Src2_Mods & SISrcMods::NEG) { - Lo_src2_mods |= SISrcMods::NEG; - } - Op0L_Op1L.addImm(Lo_src2_mods); // src2_modifiers - if (Src2_Mods & SISrcMods::OP_SEL_0) { - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); - Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx); - } else { - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx); - } - Op0L_Op1L.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. safe to assign them 0 - // for this use case - Op0L_Op1L.addImm(0); // omod - - if (I.getOperand(0).isUndef()) { - Op0L_Op1L->getOperand(0).setIsUndef(); - } - - LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - - SrcSubIdx1 = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); - SrcSubIdx2 = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); - SrcSubIdx3 = TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); - DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1); - - // Packed instructions (VOP3P) do not support abs. It is safe to ignore them. 
- unsigned Hi_src0_mods = 0; - unsigned Hi_src1_mods = 0; - unsigned Hi_src2_mods = 0; - - MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); - Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst - if (Src0_Mods & SISrcMods::NEG_HI) { - Hi_src0_mods |= SISrcMods::NEG_HI; - } - Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers - if (Src0_Mods & SISrcMods::OP_SEL_1) { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 - } else { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); - // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); - } - - if (Src1_Mods & SISrcMods::NEG_HI) { - Hi_src1_mods |= SISrcMods::NEG_HI; - } - Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers - - if (Src1_Mods & SISrcMods::OP_SEL_1) { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 - } else { - Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers unsigned Src1SubIdx = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); - } - - if (Src2_Mods & SISrcMods::NEG_HI) { - Hi_src2_mods |= SISrcMods::NEG_HI; - } - Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers - - if (Src2_Mods & SISrcMods::OP_SEL_1) { - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); - Op0H_Op1H.addReg(SrcMO3.getReg(), 0, Src2SubIdx); // src0 - } else { - Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src2SubIdx); + NewMI.addReg(SrcMO2.getReg(), 0, + Src1SubIdx); + } + + if (isFMA) { + MachineOperand &SrcMO3 = I.getOperand(6); + Register SrcReg3 = SrcMO3.getReg(); + int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers); + unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); + unsigned New_Src2_Mods = 0; + //If NEG or NEG_HI is true, we need to negate the corresponding 32 bit lane. + // This is also true for NEG_HI as it shares the same bit position with ABS. + // But packed instructions do not support ABS. Therefore, NEG_HI must + // be translated to NEG source modifier for the higher 32 bits. + // Unpacked VOP3 instructions do support ABS, therefore we need to explicitly add + // the NEG modifier if present in the packed instruction + if (Src2_Mods & NegModifier) { + // New_Src2_Mods |= NegModifier; + New_Src2_Mods |= SISrcMods::NEG; + } + NewMI.addImm(New_Src2_Mods); // src2_modifiers + if (Src2_Mods & OpSelModifier) { + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); + NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx); + } else { + unsigned Src2SubIdx = + TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); + // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 + NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx); + } } - Op0H_Op1H.addImm(ClampVal); // clamp + NewMI.addImm(ClampVal); // clamp // packed instructions do not support output modifiers. 
safe to assign them 0 // for this use case - Op0H_Op1H.addImm(0); // omod - LIS->InsertMachineInstrInMaps(*Op0H_Op1H); - - if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { - Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); - Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); - } - LIS->RemoveMachineInstrFromMaps(I); - I.eraseFromParent(); - LIS->removeInterval(DstReg); - LIS->createAndComputeVirtRegInterval(DstReg); - return; + NewMI.addImm(0); // omod + return NewMI; } void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { @@ -690,20 +564,13 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { return; } MachineBasicBlock &MBB = *I.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineFunction &MF = *MBB.getParent(); - - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg1 = I.getOperand(2).getReg(); - Register SrcReg2 = I.getOperand(4).getReg(); + MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); const DebugLoc &DL = I.getDebugLoc(); - const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg()); - const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg()); - const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg()); + const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg()); bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); @@ -1000,6 +867,8 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // co-issue unpacked instructions with MFMA for (MachineBasicBlock &MBB : MF) { SetVector InstrsToUnpack; + SetVector WriteOperands; + SetVector ReadOperands; IsF16MaskSet = false; uint16_t NumMFMACycles = 0; auto SchedModel = TII->getSchedModel(); @@ -1050,5 +919,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { } } } + LIS->reanalyze(MF); return Changed; } \ No newline at end of file From 1bcbebaf648b3a551d1429e1206a8264fbc14d5c Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 2 Sep 2025 16:31:40 -0500 Subject: [PATCH 14/16] remove f16 support && add dependency checks --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 417 +++--------------- ...npack-non-coissue-insts-post-scheduler.mir | 55 --- 2 files changed, 68 insertions(+), 404 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 3bbed5a4d7e8a..e2d1fc073bd73 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -34,6 +34,7 @@ /// in machine schedules and is expected to improve performance. Only those /// packed instructions are unpacked that are overlapped by the MFMA latency. /// Rest should remain untouched. 
+/// TODO: Add support for F16 packed instructions //===----------------------------------------------------------------------===// #include "GCNPreRAOptimizations.h" @@ -75,15 +76,13 @@ class GCNPreRAOptimizationsImpl { MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64); - void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget); void processFMAF32Unpacking(MachineInstr &I); - MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA); - bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI); - - bool IsF16MaskSet; - Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions - Register - ShiftAmt; // mask to extract higher 16 bits from F16 packed instructions + MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, + const DebugLoc &DL, + uint16_t UnpackedOpcode, bool isHiBits, + bool isFMA); + bool hasReadWriteDependencies(const MachineInstr &PredMI, + const MachineInstr &SuccMI); public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -262,11 +261,10 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( switch (Opcode) { case AMDGPU::V_PK_ADD_F32: case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_MUL_F16: - case AMDGPU::V_PK_ADD_F16: return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg()); case AMDGPU::V_PK_FMA_F32: - return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && MI.getOperand(6).isReg()); + return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && + MI.getOperand(6).isReg()); default: return false; } @@ -282,10 +280,6 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { return AMDGPU::V_ADD_F32_e64; case AMDGPU::V_PK_MUL_F32: return AMDGPU::V_MUL_F32_e64; - case AMDGPU::V_PK_ADD_F16: - return AMDGPU::V_ADD_F16_e64; - case AMDGPU::V_PK_MUL_F16: - return AMDGPU::V_MUL_F16_e64; case AMDGPU::V_PK_FMA_F32: return AMDGPU::V_FMA_F32_e64; default: @@ -293,15 +287,20 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { } } -bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI) { - for (const MachineOperand &Pred_Ops: PredMI.operands()) { - if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) continue; +bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies( + const MachineInstr &PredMI, const MachineInstr &SuccMI) { + for (const MachineOperand &Pred_Ops : PredMI.operands()) { + if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) + continue; Register Pred_Reg = Pred_Ops.getReg(); - if (!Pred_Reg.isValid()) continue; - for (const MachineOperand &Succ_Ops: SuccMI.operands()) { - if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) continue; + if (!Pred_Reg.isValid()) + continue; + for (const MachineOperand &Succ_Ops : SuccMI.operands()) { + if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) + continue; Register Succ_Reg = Succ_Ops.getReg(); - if (!Succ_Reg.isValid()) continue; + if (!Succ_Reg.isValid()) + continue; if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) { return true; } @@ -315,9 +314,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( auto *BB = BeginMI.getParent(); auto *MF = BB->getParent(); int NumInst = 0; - auto E = BB->end(); - int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { @@ -329,56 +326,25 @@ 
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( if (Instr.isMetaInstruction()) continue; - if (Instr.isTerminator()) return false; - if (TotalCyclesBetweenCandidates > NumMFMACycles) return false; - if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { - if (hasReadWriteDependencies(BeginMI, Instr)){ - dbgs() << "## here\n"; - } - if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || - (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) { - // unpacking packed F16 instructions requires multiple instructions. - // Instructions are issued to extract lower and higher bits for each - // operand Instructions are then issued for 2 unpacked instructions, and - // additional instructions to put them back into the original - // destination register The following sequence of instructions are - // issued - - // The next two are needed to move masks into vgprs. Ideally, immediates - // should be used. However, if one of the source operands are - // sgpr/sregs, then immediates are not allowed. Hence, the need to move - // these into vgprs - - // vgpr_32 = V_MOV_B32_e32 65535 - // vgpr_32 = V_MOV_B32_e32 16 - - // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32 - // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64 - // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32 - // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32 - // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0 - // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0 - // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32 - // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32 - - // we need to issue the MOV instructions above only once. Once these are - // issued, the IsF16MaskSet flag is set subsequent unpacking only needs - // to issue the remaining instructions The number of latency cycles for - // each instruction above is 1. It's hard coded into the code to reduce - // code complexity. - if (IsF16MaskSet) - TotalCyclesBetweenCandidates += 7; - else - TotalCyclesBetweenCandidates += 9; - } else - TotalCyclesBetweenCandidates += 1; - - if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) + if (hasReadWriteDependencies(BeginMI, Instr)) + return false; + + // if it is a packed instruction, we should subtract it's latency from the + // overall latency calculation here, because the packed instruction will + // be removed and replaced by 2 unpacked instructions + TotalCyclesBetweenCandidates -= + SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; + // We're adding 2 to account for the extra latency added by unpacking into + // 2 instructions. At the time of writing, the considered unpacked + // instructions have latency of 1. 
+ // TODO: improve latency handling of possible inserted instructions + TotalCyclesBetweenCandidates += 2; + if (!(TotalCyclesBetweenCandidates >= NumMFMACycles)) InstrsToUnpack.insert(&Instr); } } @@ -390,7 +356,7 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool IsVreg_64) { - MachineBasicBlock &MBB = *I.getParent(); + MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); Register DstReg = DstMO.getReg(); @@ -398,15 +364,17 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( if (UnpackedOpcode == std::numeric_limits::max()) return; - MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false); + MachineInstrBuilder Op0L_Op1L = + createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false); if (IsVreg_64) { Op0L_Op1L->getOperand(0).setIsUndef(); } else if (DstMO.isUndef()) { Op0L_Op1L->getOperand(0).setIsUndef(); } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - - MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false); + + MachineInstrBuilder Op0H_Op1H = + createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false); LIS->InsertMachineInstrInMaps(*Op0H_Op1H); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { @@ -435,8 +403,9 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); if (UnpackedOpcode == std::numeric_limits::max()) return; - - MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true); + + MachineInstrBuilder Op0L_Op1L = + createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true); if (IsVReg64) Op0L_Op1L->getOperand(0).setIsUndef(); else if (I.getOperand(0).isUndef()) { @@ -444,7 +413,8 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true); + MachineInstrBuilder Op0H_Op1H = + createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true); LIS->InsertMachineInstrInMaps(*Op0H_Op1H); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { @@ -463,7 +433,9 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { return; } -MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) { +MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI( + MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, + uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) { MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); @@ -471,7 +443,9 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc Register SrcReg1 = SrcMO1.getReg(); Register SrcReg2 = SrcMO2.getReg(); const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg()); - unsigned DestSubIdx = isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); + unsigned DestSubIdx = + isHiBits ? 
TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) + : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); int64_t ClampVal = I.getOperand(ClampIdx).getImm(); @@ -479,16 +453,16 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); int Src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. unsigned New_Src0_Mods = 0; unsigned New_Src1_Mods = 0; - + unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; - + MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(DstReg, 0, DestSubIdx); // vdst if (Src0_Mods & NegModifier) { @@ -519,25 +493,26 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 unsigned Src1SubIdx = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - NewMI.addReg(SrcMO2.getReg(), 0, - Src1SubIdx); + NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); } if (isFMA) { MachineOperand &SrcMO3 = I.getOperand(6); Register SrcReg3 = SrcMO3.getReg(); - int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers); + int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx( + I.getOpcode(), AMDGPU::OpName::src2_modifiers); unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); unsigned New_Src2_Mods = 0; - //If NEG or NEG_HI is true, we need to negate the corresponding 32 bit lane. - // This is also true for NEG_HI as it shares the same bit position with ABS. - // But packed instructions do not support ABS. Therefore, NEG_HI must - // be translated to NEG source modifier for the higher 32 bits. - // Unpacked VOP3 instructions do support ABS, therefore we need to explicitly add - // the NEG modifier if present in the packed instruction + // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit + // lane. + // This is also true for NEG_HI as it shares the same bit position with + // ABS. But packed instructions do not support ABS. Therefore, NEG_HI must + // be translated to NEG source modifier for the higher 32 bits. 
+ // Unpacked VOP3 instructions do support ABS, therefore we need to + // explicitly add the NEG modifier if present in the packed instruction if (Src2_Mods & NegModifier) { // New_Src2_Mods |= NegModifier; - New_Src2_Mods |= SISrcMods::NEG; + New_Src2_Mods |= SISrcMods::NEG; } NewMI.addImm(New_Src2_Mods); // src2_modifiers if (Src2_Mods & OpSelModifier) { @@ -564,7 +539,7 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { return; } MachineBasicBlock &MBB = *I.getParent(); - + MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); @@ -577,256 +552,6 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { return; } -void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, - uint16_t AvailableBudget) { - MachineBasicBlock &MBB = *I.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &DstMO = I.getOperand(0); - MachineOperand &SrcMO0 = I.getOperand(2); - MachineOperand &SrcMO1 = I.getOperand(4); - Register DstReg = DstMO.getReg(); - Register SrcReg0 = SrcMO0.getReg(); - Register SrcReg1 = SrcMO1.getReg(); - const DebugLoc &DL = I.getDebugLoc(); - - const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; - auto SchedModel = TII->getSchedModel(); - - auto BuildImm = [&](uint32_t Val) -> std::pair { - Register ImmReg = MRI.createVirtualRegister(RC); - auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg) - .addImm(Val); - LIS->InsertMachineInstrInMaps(*NewMI); - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(NewMI); - uint16_t LatencyCycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - return {ImmReg, LatencyCycles}; - }; - - if (!IsF16MaskSet) { - std::pair RegAndLatency = BuildImm(0x0000FFFF); - MaskLo = RegAndLatency.first; // mask for lower 16 bits - RegAndLatency = BuildImm(16); - ShiftAmt = RegAndLatency.first; // mask for higher 16 bits - IsF16MaskSet = true; - } - - Register Src0_Lo = MRI.createVirtualRegister(RC); - Register Src1_Lo = MRI.createVirtualRegister(RC); - Register Src0_Hi = MRI.createVirtualRegister(RC); - Register Src1_Hi = MRI.createVirtualRegister(RC); - - unsigned SubRegID = 0; - if (SrcMO0.getSubReg()) - SubRegID = SrcMO0.getSubReg(); - - int Src0_modifiers_Idx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int Src1_modifiers_Idx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); - unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); - int ClampIdx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); - int64_t ClampVal = I.getOperand(ClampIdx).getImm(); - - // handle op_sel for src0 - if (Src0_Mods & SISrcMods::OP_SEL_0) { - // if op_sel is set, select higher 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder LoInput0_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo) - .addReg(ShiftAmt); - if (SubRegID) - LoInput0_MI.addReg(SrcReg0, 0, SubRegID); - else - LoInput0_MI.addReg(SrcReg0); - LIS->InsertMachineInstrInMaps(*LoInput0_MI); - } else { - // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder LoInput0_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo); - if (SubRegID) - LoInput0_MI.addReg(SrcReg0, 0, SubRegID); - else - LoInput0_MI.addReg(SrcReg0); - 
LoInput0_MI.addReg(MaskLo); - LIS->InsertMachineInstrInMaps(*LoInput0_MI); - } - - // handle op_sel_hi for src0 - if (Src0_Mods & SISrcMods::OP_SEL_1) { - // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder HiInput0_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi) - .addReg(ShiftAmt); - if (SubRegID) - HiInput0_MI.addReg(SrcReg0, 0, SubRegID); - else - HiInput0_MI.addReg(SrcReg0); - LIS->InsertMachineInstrInMaps(*HiInput0_MI); - } else { - // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits - // of new vgpr - MachineInstrBuilder HiInput0_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi); - if (SubRegID) - HiInput0_MI.addReg(SrcReg0, 0, SubRegID); - else - HiInput0_MI.addReg(SrcReg0); - HiInput0_MI.addReg(MaskLo); - LIS->InsertMachineInstrInMaps(*HiInput0_MI); - } - - SubRegID = 0; - if (SrcMO0.getSubReg()) - SubRegID = SrcMO1.getSubReg(); - // handle op_sel for src1 - if (Src1_Mods & SISrcMods::OP_SEL_0) { - // if op_sel is set, select higher 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder LoInput1_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo) - .addReg(ShiftAmt); - if (SubRegID) - LoInput1_MI.addReg(SrcReg1, 0, SubRegID); - else - LoInput1_MI.addReg(SrcReg1); - LIS->InsertMachineInstrInMaps(*LoInput1_MI); - } else { - // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder LoInput1_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo); - if (SubRegID) - LoInput1_MI.addReg(SrcReg1, 0, SubRegID); - else - LoInput1_MI.addReg(SrcReg1); - LoInput1_MI.addReg(MaskLo); - LIS->InsertMachineInstrInMaps(*LoInput1_MI); - } - - // handle op_sel_hi for src1 - if (Src1_Mods & SISrcMods::OP_SEL_1) { - // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of - // new vgpr - MachineInstrBuilder HiInput1_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi) - .addReg(ShiftAmt); - if (SubRegID) - HiInput1_MI.addReg(SrcReg1, 0, SubRegID); - else - HiInput1_MI.addReg(SrcReg1); - LIS->InsertMachineInstrInMaps(*HiInput1_MI); - } else { - // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits - // of new vgpr - MachineInstrBuilder HiInput1_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi); - if (SubRegID) - HiInput1_MI.addReg(SrcReg1, 0, SubRegID); - else - HiInput1_MI.addReg(SrcReg1); - HiInput1_MI.addReg(MaskLo); - LIS->InsertMachineInstrInMaps(*HiInput1_MI); - } - - Register LoMul = MRI.createVirtualRegister(RC); - Register HiMul = MRI.createVirtualRegister(RC); - - unsigned Lo_src0_mods = 0; - unsigned Lo_src1_mods = 0; - uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - - if (UnpackedOpcode == std::numeric_limits::max()) - return; - // Unpacked instructions - MachineInstrBuilder LoMul_MI = - BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul); - - if (Src0_Mods & SISrcMods::NEG) - Lo_src0_mods |= SISrcMods::NEG; - - LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers - LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0 - - if (Src1_Mods & SISrcMods::NEG) - Lo_src1_mods |= SISrcMods::NEG; - - LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers - LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1 - LoMul_MI.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. 
safe to assign them 0 - // for this use case - LoMul_MI.addImm(0); // omod - - // unpacked instruction with VOP3 encoding for Hi bits - unsigned Hi_src0_mods = 0; - unsigned Hi_src1_mods = 0; - - MachineInstrBuilder HiMul_MI = - BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), HiMul); - if (Src0_Mods & SISrcMods::NEG_HI) - Hi_src0_mods |= SISrcMods::NEG_HI; - - HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers - HiMul_MI.addReg(Src0_Hi, - RegState::Kill); // select higher 16 bits if op_sel_hi is set - - if (Src1_Mods & SISrcMods::NEG_HI) - Hi_src1_mods |= SISrcMods::NEG_HI; - - HiMul_MI.addImm(Hi_src1_mods); // src0_modifiers - HiMul_MI.addReg( - Src1_Hi, - RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set - HiMul_MI.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. safe to assign them 0 - // for this use case - HiMul_MI.addImm(0); // omod - - // Shift HiMul left by 16 - Register HiMulShifted = MRI.createVirtualRegister(RC); - MachineInstrBuilder HiMulShifted_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted) - .addReg(ShiftAmt) - .addReg(HiMul); - - SubRegID = 0; - if (DstMO.getSubReg()) - SubRegID = DstMO.getSubReg(); - // OR LoMul | (HiMul << 16) - MachineInstrBuilder RewriteBackToDst_MI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64)); - if (SubRegID) { - if (DstMO.isUndef()) { - RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID); - } else { - RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID); - } - } else { - if (DstMO.isUndef()) { - RewriteBackToDst_MI.addDef(DstReg, RegState::Undef); - } else { - RewriteBackToDst_MI.addDef(DstReg); - } - } - RewriteBackToDst_MI.addReg(LoMul); - RewriteBackToDst_MI.addReg(HiMulShifted); - - LIS->InsertMachineInstrInMaps(*LoMul_MI); - LIS->InsertMachineInstrInMaps(*HiMul_MI); - LIS->InsertMachineInstrInMaps(*HiMulShifted_MI); - LIS->InsertMachineInstrInMaps(*RewriteBackToDst_MI); - LIS->RemoveMachineInstrFromMaps(I); - I.eraseFromParent(); - LIS->removeInterval(DstReg); - LIS->createAndComputeVirtRegInterval(DstReg); -} - bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -869,7 +594,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { SetVector InstrsToUnpack; SetVector WriteOperands; SetVector ReadOperands; - IsF16MaskSet = false; uint16_t NumMFMACycles = 0; auto SchedModel = TII->getSchedModel(); for (MachineInstr &MI : MBB) { @@ -910,12 +634,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { if (!InstrsToUnpack.empty()) { for (MachineInstr *MI : InstrsToUnpack) { - if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || - (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) { - processF16Unpacking(*MI, NumMFMACycles); - } else { - processF32Unpacking(*MI); - } + processF32Unpacking(*MI); } } } diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir index b13f61a963ed5..6b871b1d1881b 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir @@ -152,58 +152,3 @@ body: | %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 -... 
---- -name: test_only_overlapped_unpacking_f16 -tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' } -body: | - bb.0.entry: - liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_only_overlapped_unpacking_f16 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0 - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub4_sub5, 0, 0 - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub6_sub7, 0, 0 - ; GCN-NEXT: early-clobber %4:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub6_sub7, 0, 0 - ; GCN-NEXT: dead [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GCN-NEXT: early-clobber %6:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub4_sub5, 0, 0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM1]] - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub7 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %4.sub6 - ; GCN-NEXT: undef [[V_PK_MUL_F16_:%[0-9]+]].sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: dead early-clobber %12:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec - ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 %6.sub6, [[V_MOV_B32_e32_]], implicit $exec - ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], %6.sub6, implicit $exec - ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[COPY4]], [[V_MOV_B32_e32_]], implicit $exec - ; GCN-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec - ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_AND_B32_e32_]], 0, killed [[V_AND_B32_e32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_LSHRREV_B32_e64_]], 0, killed [[V_LSHRREV_B32_e64_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_MUL_F16_e64_1]], implicit $exec - ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]].sub2:vreg_128_align2 = V_OR_B32_e64 [[V_MUL_F16_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %4.sub5 - ; GCN-NEXT: dead [[V_PK_MUL_F16_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub5, 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 - %3:sgpr_64(p4) = COPY $sgpr4_sgpr5 - early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0 - %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub4_sub5, 0, 0 - %23:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub6_sub7, 0, 0 - early-clobber %25:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub6_sub7, 0, 0 - %12:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - early-clobber %24:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub4_sub5, 0, 0 - %29:vreg_64_align2 = COPY %22 - %30:vreg_64_align2 = COPY %23 - %51:vgpr_32 = COPY %25.sub7 - %55:vgpr_32 = COPY %25.sub6 - undef %99.sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub7, 
8, %51, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %28:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 %29, %30, 0, 0, 0, 0, implicit $mode, implicit $exec - %99.sub2:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub6, 8, %55, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %59:vgpr_32 = COPY %25.sub5 - %99.sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub5, 8, %59, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - S_ENDPGM 0 From 8ed311eb3c7530b996e350b933a037fa409b677d Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 2 Sep 2025 19:30:43 -0500 Subject: [PATCH 15/16] code cleanup, add code comments --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index e2d1fc073bd73..281208a143161 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -65,22 +65,39 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); + // creates a list of packed instructions following an MFMA that are suitable + // for unpacking bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles); + // check if the machine instruction being processed is a supported packed + // instruction bool isUnpackingSupportedInstr(MachineInstr &MI) const; + // function to perform unpacking of F32 packed instructions with 2 source + // operands, such as V_PK_MUL and V_PK_ADD. Currently, only V_PK_MUL and + // V_PK_ADD are supported for this transformation void processF32Unpacking(MachineInstr &I); + // select corresponding unpacked instruction from packed instruction as input uint16_t mapToUnpackedOpcode(MachineInstr &I); - + // inserts appropriate unpacked instructions into the BB void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64); + // function to perform unpacking of F32 packed instructions with 3 source + // operands, such as V_PK_FMA. Currently, only V_PK_FMA is supported for this + // transformation void processFMAF32Unpacking(MachineInstr &I); + // creates the unpacked instruction to be inserted. Adds source modifiers to + // the unpacked instructions based on the source modifiers in the packed + // instruction MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA); + // checks if there are register dependencies between those used by the MFMA + // instruction and the following packed instructions. Conservatively ensures + // that we do not incorrectly read/write registers. 
bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI); @@ -312,8 +329,6 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles) { auto *BB = BeginMI.getParent(); - auto *MF = BB->getParent(); - int NumInst = 0; auto E = BB->end(); int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); @@ -397,7 +412,7 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg()); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); @@ -440,9 +455,6 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI( MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); Register DstReg = DstMO.getReg(); - Register SrcReg1 = SrcMO1.getReg(); - Register SrcReg2 = SrcMO2.getReg(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg()); unsigned DestSubIdx = isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); @@ -498,7 +510,6 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI( if (isFMA) { MachineOperand &SrcMO3 = I.getOperand(6); - Register SrcReg3 = SrcMO3.getReg(); int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx( I.getOpcode(), AMDGPU::OpName::src2_modifiers); unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); @@ -538,14 +549,12 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { processFMAF32Unpacking(I); return; } - MachineBasicBlock &MBB = *I.getParent(); MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); - const DebugLoc &DL = I.getDebugLoc(); - const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg()); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg()); bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); From 064fa84c1853e9071d660d7ed73bda597ded13a4 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 4 Sep 2025 12:54:53 -0500 Subject: [PATCH 16/16] add support for immediate operands, modularize code --- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 227 ++++++------------ 1 file changed, 77 insertions(+), 150 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 281208a143161..ed52a56355486 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -39,7 +39,6 @@ #include "GCNPreRAOptimizations.h" #include "AMDGPU.h" -#include "GCNSchedStrategy.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" @@ -48,8 +47,6 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -65,42 +62,39 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool 
processReg(Register Reg); - // creates a list of packed instructions following an MFMA that are suitable - // for unpacking + // Creates a list of packed instructions following an MFMA that are suitable + // for unpacking. bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles); - // check if the machine instruction being processed is a supported packed + // Check if the machine instruction being processed is a supported packed // instruction bool isUnpackingSupportedInstr(MachineInstr &MI) const; - // function to perform unpacking of F32 packed instructions with 2 source - // operands, such as V_PK_MUL and V_PK_ADD. Currently, only V_PK_MUL and - // V_PK_ADD are supported for this transformation + // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA. + // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this + // transformation. void processF32Unpacking(MachineInstr &I); - // select corresponding unpacked instruction from packed instruction as input + // Select corresponding unpacked instruction from packed instruction as input uint16_t mapToUnpackedOpcode(MachineInstr &I); - // inserts appropriate unpacked instructions into the BB - void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO, - MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2, - MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, - bool isVreg_64); - // function to perform unpacking of F32 packed instructions with 3 source - // operands, such as V_PK_FMA. Currently, only V_PK_FMA is supported for this - // transformation - void processFMAF32Unpacking(MachineInstr &I); - // creates the unpacked instruction to be inserted. Adds source modifiers to + // Insert appropriate unpacked instructions into the BB + void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA); + // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed // instruction MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, - uint16_t UnpackedOpcode, bool isHiBits, - bool isFMA); - // checks if there are register dependencies between those used by the MFMA + uint16_t UnpackedOpcode, bool IsHiBits, + bool IsFMA); + // Identify register dependencies between those used by the MFMA // instruction and the following packed instructions. Conservatively ensures // that we do not incorrectly read/write registers. bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI); + void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods, + unsigned NegModifier, unsigned OpSelModifier, + MachineOperand &SrcMO); + public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} bool run(MachineFunction &MF); @@ -278,18 +272,17 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( switch (Opcode) { case AMDGPU::V_PK_ADD_F32: case AMDGPU::V_PK_MUL_F32: - return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg()); case AMDGPU::V_PK_FMA_F32: - return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && - MI.getOperand(6).isReg()); + return true; default: return false; } + llvm_unreachable("Fully covered switch"); } uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); - // use 64 bit encoding to allow use of VOP3 instructions. + // Use 64 bit encoding to allow use of VOP3 instructions. 
// VOP3 instructions allow VOP3P source modifiers to be translated to VOP3 // e32 instructions are VOP2 and don't allow source modifiers switch (Opcode) { @@ -302,6 +295,7 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) { default: return std::numeric_limits::max(); } + llvm_unreachable("Fully covered switch"); } bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies( @@ -349,7 +343,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( if (hasReadWriteDependencies(BeginMI, Instr)) return false; - // if it is a packed instruction, we should subtract it's latency from the + // If it is a packed instruction, we should subtract it's latency from the // overall latency calculation here, because the packed instruction will // be removed and replaced by 2 unpacked instructions TotalCyclesBetweenCandidates -= @@ -359,37 +353,32 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr( // instructions have latency of 1. // TODO: improve latency handling of possible inserted instructions TotalCyclesBetweenCandidates += 2; - if (!(TotalCyclesBetweenCandidates >= NumMFMACycles)) + if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1)) InstrsToUnpack.insert(&Instr); } } return true; } -void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( - MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, - MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, - MachineOperand &HiSrcMO2, bool IsVreg_64) { - +void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I, + bool IsVreg_64, + bool IsFMA) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - Register DstReg = DstMO.getReg(); + Register DstReg = I.getOperand(0).getReg(); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); if (UnpackedOpcode == std::numeric_limits::max()) return; - MachineInstrBuilder Op0L_Op1L = - createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false); - if (IsVreg_64) { + MachineInstrBuilder Op0L_Op1L = createUnpackedMI( + MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA); + if (IsVreg_64 || I.getOperand(0).isUndef()) Op0L_Op1L->getOperand(0).setIsUndef(); - } else if (DstMO.isUndef()) { - Op0L_Op1L->getOperand(0).setIsUndef(); - } LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - MachineInstrBuilder Op0H_Op1H = - createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false); + MachineInstrBuilder Op0H_Op1H = createUnpackedMI( + MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA); LIS->InsertMachineInstrInMaps(*Op0H_Op1H); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { @@ -408,55 +397,52 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI( return; } -void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) { - MachineBasicBlock &MBB = *I.getParent(); - Register DstReg = I.getOperand(0).getReg(); - const DebugLoc &DL = I.getDebugLoc(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - - uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - if (UnpackedOpcode == std::numeric_limits::max()) - return; - - MachineInstrBuilder Op0L_Op1L = - createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true); - if (IsVReg64) - Op0L_Op1L->getOperand(0).setIsUndef(); - else if (I.getOperand(0).isUndef()) { - Op0L_Op1L->getOperand(0).setIsUndef(); - } - LIS->InsertMachineInstrInMaps(*Op0L_Op1L); - - MachineInstrBuilder Op0H_Op1H = - createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true); - LIS->InsertMachineInstrInMaps(*Op0H_Op1H); - - if 
(I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { - Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); - Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); +void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI, + unsigned Src_Mods, + unsigned NegModifier, + unsigned OpSelModifier, + MachineOperand &SrcMO) { + unsigned New_Src_Mods = 0; + // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit + // lane. + // NEG_HI shares the same bit position with ABS. But packed instructions do + // not support ABS. Therefore, NEG_HI must be translated to NEG source + // modifier for the higher 32 bits. Unpacked VOP3 instructions do support + // ABS, therefore we need to explicitly add the NEG modifier if present in + // the packed instruction + if (Src_Mods & NegModifier) { + New_Src_Mods |= SISrcMods::NEG; } - if (I.getFlag(MachineInstr::MIFlag::FmContract)) { - Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract); - Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract); + // Src modifiers. Only negative modifiers are added if needed. Unpacked + // operations do not have op_sel, therefore it must be handled explicitly as + // done below. Unpacked operations support abs, but packed instructions do + // not. Thus, abs is not handled. + NewMI.addImm(New_Src_Mods); + if (SrcMO.isImm()) { + NewMI.addImm(SrcMO.getImm()); + } else { + if (Src_Mods & OpSelModifier) { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1); + NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx); // src0 + } else { + unsigned Src0SubIdx = + TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0); + // If op_sel == 0, select register 0 of reg:sub0_sub1 + NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx); + } } - - LIS->RemoveMachineInstrFromMaps(I); - I.eraseFromParent(); - LIS->removeInterval(DstReg); - LIS->createAndComputeVirtRegInterval(DstReg); - return; } MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI( MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, - uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) { + uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) { MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); Register DstReg = DstMO.getReg(); unsigned DestSubIdx = - isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) + IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0); int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); @@ -472,92 +458,33 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI( unsigned New_Src0_Mods = 0; unsigned New_Src1_Mods = 0; - unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; - unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; + unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned OpSelModifier = IsHiBits ? 
SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(DstReg, 0, DestSubIdx); // vdst - if (Src0_Mods & NegModifier) { - New_Src0_Mods |= SISrcMods::NEG; - } - NewMI.addImm(New_Src0_Mods); // src0_modifiers - - if (Src0_Mods & OpSelModifier) { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1); - NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0 - } else { - unsigned Src0SubIdx = - TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0); - // if op_sel == 0, select register 0 of reg:sub0_sub1 - NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); - } - - if (Src1_Mods & NegModifier) { - New_Src1_Mods |= SISrcMods::NEG; - } - NewMI.addImm(New_Src1_Mods); // src1_modifiers - if (Src1_Mods & OpSelModifier) { - unsigned Src1SubIdx = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1); - NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0 - } else { - // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - unsigned Src1SubIdx = - TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0); - NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); - } + addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1); + addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2); - if (isFMA) { + if (IsFMA) { MachineOperand &SrcMO3 = I.getOperand(6); int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx( I.getOpcode(), AMDGPU::OpName::src2_modifiers); unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); - unsigned New_Src2_Mods = 0; - // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit - // lane. - // This is also true for NEG_HI as it shares the same bit position with - // ABS. But packed instructions do not support ABS. Therefore, NEG_HI must - // be translated to NEG source modifier for the higher 32 bits. - // Unpacked VOP3 instructions do support ABS, therefore we need to - // explicitly add the NEG modifier if present in the packed instruction - if (Src2_Mods & NegModifier) { - // New_Src2_Mods |= NegModifier; - New_Src2_Mods |= SISrcMods::NEG; - } - NewMI.addImm(New_Src2_Mods); // src2_modifiers - if (Src2_Mods & OpSelModifier) { - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1); - NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx); - } else { - unsigned Src2SubIdx = - TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0); - // if op_sel_hi == 0, select register 0 of reg:sub0_sub1 - NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx); - } + addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3); } NewMI.addImm(ClampVal); // clamp - // packed instructions do not support output modifiers. safe to assign them 0 + // Packed instructions do not support output modifiers. safe to assign them 0 // for this use case NewMI.addImm(0); // omod return NewMI; } void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) { - if (I.getOpcode() == AMDGPU::V_PK_FMA_F32) { - processFMAF32Unpacking(I); - return; - } - - MachineOperand &DstMO = I.getOperand(0); - MachineOperand &SrcMO1 = I.getOperand(2); - MachineOperand &SrcMO2 = I.getOperand(4); - - const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg()); - + bool IsFMA = (I.getOpcode() == AMDGPU::V_PK_FMA_F32) ? 
true : false; + const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg()); bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID); - insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64); + insertUnpackedF32MI(I, IsVReg64, IsFMA); return; }
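
For a rough, value-level picture of what the rewrite in createUnpackedMI/addOperandandMods above does, the following standalone C++ sketch models a packed f32 multiply being split into the two scalar lane operations the pass would emit: op_sel/op_sel_hi select which source lane each unpacked op reads, and neg/neg_hi negate the corresponding lane. This is only a host-side model under stated assumptions; the SrcMods bit values, struct names, and helper functions are invented for illustration and are not the pass's types nor the real SISrcMods encodings.

// demo_unpack.cpp — build with: g++ -std=c++17 demo_unpack.cpp
#include <cstdio>

// Placeholder modifier bits (illustrative values, not the SISrcMods encoding).
enum SrcMods : unsigned {
  NEG = 1u << 0,      // negate the low-lane read
  NEG_HI = 1u << 1,   // negate the high-lane read
  OP_SEL_0 = 1u << 2, // low result reads the source's high lane
  OP_SEL_1 = 1u << 3, // high result reads the source's high lane
};

// A 64-bit register pair holding two packed f32 lanes: Sub[0]=sub0, Sub[1]=sub1.
struct RegPair {
  float Sub[2];
};

// Pick the lane for one unpacked op (op_sel / op_sel_hi), then apply neg / neg_hi.
static float readLane(const RegPair &Src, unsigned Mods, bool IsHiBits) {
  unsigned OpSelBit = IsHiBits ? OP_SEL_1 : OP_SEL_0;
  unsigned NegBit = IsHiBits ? NEG_HI : NEG;
  float V = Src.Sub[(Mods & OpSelBit) ? 1 : 0];
  return (Mods & NegBit) ? -V : V;
}

// A packed multiply expressed as the two independent scalar multiplies that the
// pass would emit: one writing dst.sub0 and one writing dst.sub1.
static RegPair pkMulUnpacked(RegPair A, unsigned AMods, RegPair B, unsigned BMods) {
  RegPair D;
  for (int Hi = 0; Hi < 2; ++Hi)
    D.Sub[Hi] = readLane(A, AMods, Hi) * readLane(B, BMods, Hi);
  return D;
}

int main() {
  RegPair A{{1.0f, 2.0f}}, B{{3.0f, 4.0f}};
  // op_sel_1 set on both sources (the usual packed default: the high result
  // reads the high lanes); neg_hi on A negates only the high-lane product.
  RegPair D = pkMulUnpacked(A, OP_SEL_1 | NEG_HI, B, OP_SEL_1);
  std::printf("dst.sub0 = %g, dst.sub1 = %g\n", D.Sub[0], D.Sub[1]); // 3 and -8
  return 0;
}

The low result is untouched by neg_hi while the high result is negated, which is the same per-lane behavior the pass preserves when it translates NEG_HI into a NEG source modifier on the high-lane VOP3 instruction.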
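
In the same spirit, a toy model of the budget check in createListOfPackedInstr: packed candidates are accepted only while the accumulated cycle count still fits inside the MFMA's issue shadow, and an accepted candidate contributes the two cycles of its unpacked replacements instead of its own latency. The instruction names and latency numbers below are made up for the example; the pass itself takes latencies from the scheduling model (ReleaseAtCycle), so this is a sketch of the accounting, not of the pass's API.

// demo_budget.cpp — build with: g++ -std=c++17 demo_budget.cpp
#include <cstdio>
#include <vector>

// Toy record for an instruction that follows the MFMA: its latency in cycles
// and whether it is a packed f32 op that could be split into two scalar ops.
struct Inst {
  const char *Name;
  int Latency;
  bool IsPackedF32;
};

// Accept packed candidates while the running cycle count still fits inside the
// MFMA's issue shadow. An accepted candidate no longer contributes its own
// latency (it will be deleted) and instead costs the two cycles of the two
// scalar ops that replace it.
static std::vector<const Inst *>
pickUnpackCandidates(const std::vector<Inst> &Following, int MFMACycles) {
  std::vector<const Inst *> Picked;
  int Cycles = 0;
  for (const Inst &I : Following) {
    Cycles += I.Latency;
    if (Cycles > MFMACycles)
      break; // past the shadow: unpacking would only add visible latency
    if (!I.IsPackedF32)
      continue;
    Cycles += 2 - I.Latency; // swap the packed op for its two replacements
    if (Cycles < MFMACycles)
      Picked.push_back(&I);
  }
  return Picked;
}

int main() {
  const std::vector<Inst> Following = {
      {"v_pk_mul_f32", 1, true},
      {"global_load_dwordx4", 8, false},
      {"v_pk_fma_f32", 1, true},
  };
  // With an assumed 8-cycle MFMA shadow, only the first packed op fits.
  for (const Inst *I : pickUnpackCandidates(Following, /*MFMACycles=*/8))
    std::printf("unpack %s\n", I->Name);
  return 0;
}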