diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 633a858df364e..92937e33fd500 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -119,9 +119,22 @@ class SIFoldOperandsImpl {
                         MachineOperand *OpToFold) const;
   bool isUseSafeToFold(const MachineInstr &MI,
                        const MachineOperand &UseMO) const;
-  bool
+
+  const TargetRegisterClass *getRegSeqInit(
+      MachineInstr &RegSeq,
+      SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
+
+  const TargetRegisterClass *
   getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
-                Register UseReg, uint8_t OpTy) const;
+                Register UseReg) const;
+
+  std::pair<MachineOperand *, const TargetRegisterClass *>
+  isRegSeqSplat(MachineInstr &RegSeg) const;
+
+  MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
+                                     MachineOperand *SplatVal,
+                                     const TargetRegisterClass *SplatRC) const;
+
   bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
                       unsigned UseOpIdx,
                       SmallVectorImpl<FoldCandidate> &FoldList) const;
@@ -825,19 +838,24 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
   return Sub;
 }
 
-// Find a def of the UseReg, check if it is a reg_sequence and find initializers
-// for each subreg, tracking it to foldable inline immediate if possible.
-// Returns true on success.
-bool SIFoldOperandsImpl::getRegSeqInit(
-    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
-    Register UseReg, uint8_t OpTy) const {
-  MachineInstr *Def = MRI->getVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
-    return false;
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+    MachineInstr &RegSeq,
+    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
 
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I != E; I += 2) {
-    MachineOperand &SrcOp = Def->getOperand(I);
-    unsigned SubRegIdx = Def->getOperand(I + 1).getImm();
+  assert(RegSeq.isRegSequence());
+
+  const TargetRegisterClass *RC = nullptr;
+
+  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
+    MachineOperand &SrcOp = RegSeq.getOperand(I);
+    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
+
+    // Only accept reg_sequence with uniform reg class inputs for simplicity.
+    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
+    if (!RC)
+      RC = OpRC;
+    else if (!TRI->getCommonSubClass(RC, OpRC))
+      return nullptr;
 
     if (SrcOp.getSubReg()) {
       // TODO: Handle subregister compose
@@ -846,8 +864,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
     }
 
     MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
-    if (DefSrc && (DefSrc->isReg() ||
-                   (DefSrc->isImm() && TII->isInlineConstant(*DefSrc, OpTy)))) {
+    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
       Defs.emplace_back(DefSrc, SubRegIdx);
       continue;
     }
@@ -855,7 +872,65 @@ bool SIFoldOperandsImpl::getRegSeqInit(
     Defs.emplace_back(&SrcOp, SubRegIdx);
   }
 
-  return true;
+  return RC;
+}
+
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking it to an immediate if possible. Returns the
+// register class of the inputs on success.
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
+    Register UseReg) const {
+  MachineInstr *Def = MRI->getVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return nullptr;
+
+  return getRegSeqInit(*Def, Defs);
+}
+
+std::pair<MachineOperand *, const TargetRegisterClass *>
+SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
+  if (!SrcRC)
+    return {};
+
+  int64_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
+      return {};
+
+    int64_t SubImm = Op->getImm();
+    if (!I) {
+      Imm = SubImm;
+      continue;
+    }
+    if (Imm != SubImm)
+      return {}; // Can only fold splat constants
+  }
+
+  return {Defs[0].first, SrcRC};
+}
+
+MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
+    MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal,
+    const TargetRegisterClass *SplatRC) const {
+  const MCInstrDesc &Desc = UseMI->getDesc();
+  if (UseOpIdx >= Desc.getNumOperands())
+    return nullptr;
+
+  // Filter out unhandled pseudos.
+  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
+    return nullptr;
+
+  // FIXME: Verify SplatRC is compatible with the use operand
+  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+  if (!TII->isInlineConstant(*SplatVal, OpTy) ||
+      !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+    return nullptr;
+
+  return SplatVal;
 }
 
 bool SIFoldOperandsImpl::tryToFoldACImm(
@@ -869,7 +944,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return false;
 
-  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
   MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (OpToFold.isImm()) {
     if (unsigned UseSubReg = UseOp.getSubReg()) {
@@ -916,31 +990,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     }
   }
 
-  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, UseReg, OpTy))
-    return false;
-
-  int32_t Imm;
-  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
-    const MachineOperand *Op = Defs[I].first;
-    if (!Op->isImm())
-      return false;
-
-    auto SubImm = Op->getImm();
-    if (!I) {
-      Imm = SubImm;
-      if (!TII->isInlineConstant(*Op, OpTy) ||
-          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-        return false;
-
-      continue;
-    }
-    if (Imm != SubImm)
-      return false; // Can only fold splat constants
-  }
-
-  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
-  return true;
+  return false;
 }
 
 void SIFoldOperandsImpl::foldOperand(
@@ -970,14 +1020,26 @@ void SIFoldOperandsImpl::foldOperand(
     Register RegSeqDstReg = UseMI->getOperand(0).getReg();
     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
 
+    MachineOperand *SplatVal;
+    const TargetRegisterClass *SplatRC;
+    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
+
     // Grab the use operands first
     SmallVector<MachineOperand *, 4> UsesToProcess(
        llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
     for (auto *RSUse : UsesToProcess) {
       MachineInstr *RSUseMI = RSUse->getParent();
+      unsigned OpNo = RSUseMI->getOperandNo(RSUse);
 
-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
-                         RSUseMI->getOperandNo(RSUse), FoldList))
+      if (SplatVal) {
+        if (MachineOperand *Foldable =
+                tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
+          appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
+          continue;
+        }
+      }
+
+      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
        continue;
 
       if (RSUse->getSubReg() != RegSeqDstSubReg)
@@ -986,6 +1048,7 @@ void SIFoldOperandsImpl::foldOperand(
       foldOperand(OpToFold, RSUseMI,
                  RSUseMI->getOperandNo(RSUse), FoldList, CopiesToReplace);
     }
+
     return;
   }
 
@@ -2137,7 +2200,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
     return false;
 
   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
+  if (!getRegSeqInit(Defs, Reg))
     return false;
 
   for (auto &[Op, SubIdx] : Defs) {
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index ecd1abce67571..ddc3e770767b8 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1680,10 +1680,12 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0
+; PACKED-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
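
Illustration only, not part of the patch: a reduced IR input of the kind the new isRegSeqSplat()/tryFoldRegSeqSplat() path in foldOperand() is aimed at. As the PACKED-GISEL checks above suggest, GlobalISel tends to materialize splat <2 x float> constants as a REG_SEQUENCE of identical scalar defs; isRegSeqSplat() recognizes that every element tracks to the same immediate, and tryFoldRegSeqSplat() then uses isInlineConstant()/isOperandLegal() to decide whether that single value may be folded into the packed use. The IR below is a hypothetical reduction, not a test added by the patch.

; Hypothetical reduced example; the splat operands <4.0, 4.0> and <1.0, 1.0>
; are the kind of values that reach foldOperand() as REG_SEQUENCE splats.
define <2 x float> @splat_fma_example(<2 x float> %x) {
  %r = call <2 x float> @llvm.fma.v2f32(<2 x float> %x,
                                        <2 x float> <float 4.0, float 4.0>,
                                        <2 x float> <float 1.0, float 1.0>)
  ret <2 x float> %r
}

declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)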