From 1fa4482ee6d61d02c76c2955fb6077262c178cce Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 09:12:16 +0700
Subject: [PATCH 1/3] AMDGPU: Factor agpr reg_sequence folding into a function

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 168 ++++++++++++----------
 1 file changed, 90 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a019dbaad02c..0e41a78c2c8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
         UseMI->getOperand(0).getReg().isVirtual() &&
         !UseMI->getOperand(1).getSubReg()) {
       LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-      unsigned Size = TII->getOpSize(*UseMI, 1);
       Register UseReg = OpToFold.getReg();
       UseMI->getOperand(1).setReg(UseReg);
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1022,83 +1023,8 @@ void SIFoldOperandsImpl::foldOperand(
 
       // Remove kill flags as kills may now be out of order with uses.
       MRI->clearKillFlags(OpToFold.getReg());
-
-      // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-      // its initializers right here, so we will rematerialize immediates and
-      // avoid copies via different reg classes.
-      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-        const DebugLoc &DL = UseMI->getDebugLoc();
-        MachineBasicBlock &MBB = *UseMI->getParent();
-
-        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-          UseMI->removeOperand(I);
-
-        MachineInstrBuilder B(*MBB.getParent(), UseMI);
-        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-        for (unsigned I = 0; I < Size / 4; ++I) {
-          MachineOperand *Def = Defs[I].first;
-          TargetInstrInfo::RegSubRegPair CopyToVGPR;
-          if (Def->isImm() &&
-              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-            int64_t Imm = Def->getImm();
-
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL,
-                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-            B.addReg(Tmp);
-          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-            auto Src = getRegSubRegPair(*Def);
-            Def->setIsKill(false);
-            if (!SeenAGPRs.insert(Src)) {
-              // We cannot build a reg_sequence out of the same registers, they
-              // must be copied. Better do it here before copyPhysReg() created
-              // several reads to do the AGPR->VGPR->AGPR copy.
-              CopyToVGPR = Src;
-            } else {
-              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                       Src.SubReg);
-            }
-          } else {
-            assert(Def->isReg());
-            Def->setIsKill(false);
-            auto Src = getRegSubRegPair(*Def);
-
-            // Direct copy from SGPR to AGPR is not possible. To avoid creation
-            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-            // create a copy here and track if we already have such a copy.
-            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-              CopyToVGPR = Src;
-            } else {
-              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-              B.addReg(Tmp);
-            }
-          }
-
-          if (CopyToVGPR.Reg) {
-            auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-            Register &Vgpr = It->second;
-            if (Inserted) {
-              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-            }
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL,
-                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-            B.addReg(Tmp);
-          }
-
-          B.addImm(Defs[I].second);
-        }
-        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-      }
-
-      return;
+      if (foldCopyToAGPRRegSequence(UseMI))
+        return;
     }
 
     unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,92 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  unsigned Size = TII->getOpSize(*CopyMI, 1);
+  if (Size <= 4)
+    return false;
+
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0; I < Size / 4; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();

From c78690f7db23a78c02a1cb2533a80549af8ad634 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 09:43:14 +0700
Subject: [PATCH 2/3] Remove unnecessary check for register size

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0e41a78c2c8ae..85a1c5d83c3c2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1493,10 +1493,6 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
   // copies via different reg classes.
   if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
     return false;
-  unsigned Size = TII->getOpSize(*CopyMI, 1);
-  if (Size <= 4)
-    return false;
-
   Register UseReg = CopyMI->getOperand(1).getReg();
   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
   if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
@@ -1512,7 +1508,7 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
   MachineInstrBuilder B(*MBB.getParent(), CopyMI);
   DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
   SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-  for (unsigned I = 0; I < Size / 4; ++I) {
+  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
     MachineOperand *Def = Defs[I].first;
     TargetInstrInfo::RegSubRegPair CopyToVGPR;
     if (Def->isImm() &&

From d91f9722a38ef5125bb37b3740902abba60f6e92 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Feb 2025 10:12:35 +0700
Subject: [PATCH 3/3] Use UseReg consistently

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 85a1c5d83c3c2..f1ba199fbae3f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1022,7 +1022,7 @@ void SIFoldOperandsImpl::foldOperand(
       OpToFold.setIsKill(false);
 
       // Remove kill flags as kills may now be out of order with uses.
-      MRI->clearKillFlags(OpToFold.getReg());
+      MRI->clearKillFlags(UseReg);
       if (foldCopyToAGPRRegSequence(UseMI))
         return;
     }