diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h new file mode 100644 index 0000000000000..df80196d95176 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h @@ -0,0 +1,89 @@ +//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H + +#include "GCNSubtarget.h" +#include "llvm/CodeGen/Register.h" + +namespace llvm { + +class GCNSubtarget; + +namespace AMDGPU { + +class LaneMaskConstants { +public: + const Register ExecReg; + const Register VccReg; + const unsigned AndOpc; + const unsigned AndTermOpc; + const unsigned AndN2Opc; + const unsigned AndN2SaveExecOpc; + const unsigned AndN2TermOpc; + const unsigned AndSaveExecOpc; + const unsigned AndSaveExecTermOpc; + const unsigned BfmOpc; + const unsigned CMovOpc; + const unsigned CSelectOpc; + const unsigned MovOpc; + const unsigned MovTermOpc; + const unsigned OrOpc; + const unsigned OrTermOpc; + const unsigned OrSaveExecOpc; + const unsigned XorOpc; + const unsigned XorTermOpc; + const unsigned WQMOpc; + + constexpr LaneMaskConstants(bool IsWave32) + : ExecReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC), + VccReg(IsWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC), + AndOpc(IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), + AndTermOpc(IsWave32 ? AMDGPU::S_AND_B32_term : AMDGPU::S_AND_B64_term), + AndN2Opc(IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64), + AndN2SaveExecOpc(IsWave32 ? AMDGPU::S_ANDN2_SAVEEXEC_B32 + : AMDGPU::S_ANDN2_SAVEEXEC_B64), + AndN2TermOpc(IsWave32 ? AMDGPU::S_ANDN2_B32_term + : AMDGPU::S_ANDN2_B64_term), + AndSaveExecOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 + : AMDGPU::S_AND_SAVEEXEC_B64), + AndSaveExecTermOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32_term + : AMDGPU::S_AND_SAVEEXEC_B64_term), + BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), + CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), + MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term), + OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64), + OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term), + OrSaveExecOpc(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), + XorOpc(IsWave32 ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64), + XorTermOpc(IsWave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term), + WQMOpc(IsWave32 ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64) {} + + static inline const LaneMaskConstants &get(const GCNSubtarget &ST); +}; + +static constexpr LaneMaskConstants LaneMaskConstants32 = + LaneMaskConstants(/*IsWave32=*/true); +static constexpr LaneMaskConstants LaneMaskConstants64 = + LaneMaskConstants(/*IsWave32=*/false); + +inline const LaneMaskConstants &LaneMaskConstants::get(const GCNSubtarget &ST) { + unsigned WavefrontSize = ST.getWavefrontSize(); + assert(WavefrontSize == 32 || WavefrontSize == 64); + return WavefrontSize == 32 ? 
LaneMaskConstants32 : LaneMaskConstants64; +} + +} // end namespace AMDGPU + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 36b27bef350ed..a74d56f0a6781 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -73,6 +73,7 @@ #include "AMDGPU.h" #include "AMDGPUGlobalISelUtils.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineFunction *MF = &B.getMF(); const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); - const unsigned MovExecOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const unsigned MovExecTermOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; - - const unsigned XorTermOpc = Subtarget.isWave32() ? - AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - const unsigned AndSaveExecOpc = Subtarget.isWave32() ? - AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - const unsigned ExecReg = Subtarget.isWave32() ? - AMDGPU::EXEC_LO : AMDGPU::EXEC; + const AMDGPU::LaneMaskConstants &LMC = + AMDGPU::LaneMaskConstants::get(Subtarget); #ifndef NDEBUG const int OrigRangeSize = std::distance(Range.begin(), Range.end()); @@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MRI.setRegClass(CondReg, WaveRC); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AndSaveExecOpc) - .addDef(NewExec) - .addReg(CondReg, RegState::Kill); + B.buildInstr(LMC.AndSaveExecOpc) + .addDef(NewExec) + .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); B.setInsertPt(*BodyBB, BodyBB->end()); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - B.buildInstr(XorTermOpc) - .addDef(ExecReg) - .addReg(ExecReg) - .addReg(NewExec); + B.buildInstr(LMC.XorTermOpc) + .addDef(LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use // s_cbranch_scc0? @@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) - .addReg(ExecReg); + BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg) + .addReg(LMC.ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(MovExecTermOpc) - .addDef(ExecReg) - .addReg(SaveExecReg); + B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg); // Set the insert point after the original instruction, so any new // instructions will be in the remainder. 
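For orientation, a minimal sketch of how a pass consumes the new helper. The function name `emitWaterfallExecUpdate` and its parameter list are invented for this example (in the real passes the two instructions end up in different blocks of the loop); the opcodes and operand order mirror the waterfall-loop code above.

```cpp
// Illustrative sketch, not part of the patch: emit the exec-mask update used
// by a waterfall loop, with the wave32/wave64 opcode choice and the exec
// register taken from LaneMaskConstants instead of isWave32() ternaries.
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

static void emitWaterfallExecUpdate(const GCNSubtarget &ST,
                                    const SIInstrInfo &TII,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register SaveExec,
                                    Register CondReg) {
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  // SaveExec = exec; exec &= CondReg. Only the lanes whose scalar operands
  // match stay active for this trip around the loop.
  BuildMI(MBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
      .addReg(CondReg, RegState::Kill);

  // exec ^= SaveExec. Turn the just-processed lanes off and re-enable the
  // remaining ones; the loop exits once exec becomes zero.
  BuildMI(MBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
      .addReg(LMC.ExecReg)
      .addReg(SaveExec);
}
```

The same cached `LMC` reference replaces the per-use `isWave32()` selection in every pass touched below.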
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 6533d4c8eca35..7793907c032d2 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -66,6 +66,7 @@ #include "SIFixSGPRCopies.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineDominators.h" @@ -1145,7 +1146,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { - bool IsWave32 = MF.getSubtarget().isWave32(); + const AMDGPU::LaneMaskConstants &LMC = + AMDGPU::LaneMaskConstants::get(MF.getSubtarget()); for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { @@ -1159,10 +1161,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { Register SCCCopy = MRI->createVirtualRegister(TRI->getWaveMaskRegClass()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), - TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), - SCCCopy) + MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy) .addImm(-1) .addImm(0); I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), @@ -1172,14 +1171,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { continue; } if (DstReg == AMDGPU::SCC) { - unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), TII->get(Opcode)) + MI.getDebugLoc(), TII->get(LMC.AndOpc)) .addReg(Tmp, getDefRegState(true)) .addReg(SrcReg) - .addReg(Exec); + .addReg(LMC.ExecReg); MI.eraseFromParent(); } } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ce25bf499c41e..7c5d4fc2dacf6 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,6 +8,7 @@ #include "SIFrameLowering.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" @@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores( const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. Due to this, we @@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores( StoreWWMRegisters(WWMScratchRegs); auto EnableAllLanes = [&]() { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); }; if (!WWMCalleeSavedRegs.empty()) { @@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores( TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(ScratchExecCopy, RegState::Kill); LiveUnits.addReg(ScratchExecCopy); } @@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores( const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { @@ -1147,16 +1148,14 @@ void SIFrameLowering::emitCSRSpillRestores( Register OrigExec = Return.getOperand(0).getReg(); if (!WWMScratchRegs.empty()) { - unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; - BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg) .addReg(OrigExec) .addImm(-1); RestoreWWMRegisters(WWMScratchRegs); } // Restore original EXEC. - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec); // Drop the first operand and update the opcode. Return.removeOperand(0); @@ -1173,8 +1172,7 @@ void SIFrameLowering::emitCSRSpillRestores( RestoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, @@ -1185,8 +1183,7 @@ void SIFrameLowering::emitCSRSpillRestores( RestoreWWMRegisters(WWMCalleeSavedRegs); if (ScratchExecCopy) { // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(ScratchExecCopy, RegState::Kill); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9acc4b6de3501..6a4df5eeb9779 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -5027,6 +5028,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineFunction *MF = OrigBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); @@ -5058,10 +5060,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, .addReg(Idx.getReg(), 0, Idx.getSubReg()); // Update EXEC, save the original EXEC value to VCC. - BuildMI(LoopBB, I, DL, - TII->get(ST.isWave32() ? 
AMDGPU::S_AND_SAVEEXEC_B32 - : AMDGPU::S_AND_SAVEEXEC_B64), - NewExec) + BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); @@ -5088,13 +5087,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, } // Update EXEC, switch all done bits to 0 and all todo bits to 1. - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; MachineInstr *InsertPt = - BuildMI(LoopBB, I, DL, - TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term - : AMDGPU::S_XOR_B64_term), - Exec) - .addReg(Exec) + BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -5129,15 +5124,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); // Save the EXEC mask // clang-format off - BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) - .addReg(Exec); + BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec) + .addReg(LMC.ExecReg); // clang-format on auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false); @@ -5157,7 +5151,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, LoopBB->addSuccessor(LandingPad); MachineBasicBlock::iterator First = LandingPad->begin(); // clang-format off - BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) + BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(SaveExec); // clang-format on diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 70223da961e92..5106478a95b43 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -14,6 +14,7 @@ #include "SIInstrInfo.h" #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" @@ -1195,6 +1196,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); @@ -1213,10 +1215,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(1) - .addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1227,10 +1226,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, } case SIInstrInfo::SCC_FALSE: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(0) - .addImm(1); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1270,13 +1266,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::EXECNZ: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) - .addImm(0); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(1) - .addImm(0); + BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1288,13 +1279,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::EXECZ: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) - .addImm(0); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(0) - .addImm(1); + BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -2046,6 +2032,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); switch (MI.getOpcode()) { default: return TargetInstrInfo::expandPostRAPseudo(MI); case AMDGPU::S_MOV_B64_term: @@ -2470,18 +2457,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // Whole Wave Mode is entered. - MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64)); + MI.setDesc(get(LMC.OrSaveExecOpc)); break; } case AMDGPU::ENTER_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // STRICT_WQM is entered. - const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; - const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); - BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); + BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg()) + .addReg(LMC.ExecReg); + BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg); MI.eraseFromParent(); break; @@ -2490,7 +2474,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::EXIT_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // WWM/STICT_WQM is exited. - MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); + MI.setDesc(get(LMC.MovOpc)); break; } case AMDGPU::SI_RETURN: { @@ -5923,25 +5907,22 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, SlotIndexes *Indexes) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - bool IsWave32 = ST.isWave32(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); if (IsSCCLive) { // Insert two move instructions, one to save the original value of EXEC and // the other to turn on all bits in EXEC. This is required as we can't use // the single instruction S_OR_SAVEEXEC that clobbers SCC. - unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) - .addReg(Exec, RegState::Kill); - auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg) + .addReg(LMC.ExecReg, RegState::Kill); + auto FlipExecMI = + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); if (Indexes) { Indexes->insertMachineInstrInMaps(*StoreExecMI); Indexes->insertMachineInstrInMaps(*FlipExecMI); } } else { - const unsigned OrSaveExec = - IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. if (Indexes) Indexes->insertMachineInstrInMaps(*SaveExec); @@ -5952,10 +5933,9 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes) const { - unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto ExecRestoreMI = - BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); + auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg) + .addReg(Reg, RegState::Kill); if (Indexes) Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } @@ -6812,13 +6792,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned SaveExecOpc = - ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - unsigned XorTermOpc = - ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - unsigned AndOpc = - ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6846,7 +6820,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, CondReg = NewCondReg; else { // If not the first, we create an AND. 
Register AndReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg) .addReg(CondReg) .addReg(NewCondReg); CondReg = AndReg; @@ -6902,7 +6876,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, CondReg = NewCondReg; else { // If not the first, we create an AND. Register AndReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg) .addReg(CondReg) .addReg(NewCondReg); CondReg = AndReg; @@ -6931,15 +6905,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MRI.setSimpleHint(SaveExec, CondReg); // Update EXEC to matching lanes, saving original to SaveExec. - BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec) .addReg(CondReg, RegState::Kill); // The original instruction is here; we insert the terminators after it. I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) - .addReg(Exec) + BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(SaveExec); BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); @@ -6966,8 +6940,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ++End; } const DebugLoc &DL = MI.getDebugLoc(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); // Save SCC. Waterfall Loop may overwrite SCC. @@ -6989,7 +6962,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask - BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); + BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg); // Killed uses in the instruction we are waterfalling around will be // incorrect due to the added control-flow. @@ -7050,7 +7023,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } // Restore the EXEC mask - BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); + BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg) + .addReg(SaveExec); return BodyBB; } @@ -7745,12 +7719,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Clear unused bits of vcc Register CondReg = Inst.getOperand(1).getReg(); bool IsSCC = CondReg == AMDGPU::SCC; - Register VCC = RI.getVCC(); - Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) - .addReg(EXEC) - .addReg(IsSCC ? VCC : CondReg); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg) + .addReg(LMC.ExecReg) + .addReg(IsSCC ? LMC.VccReg : CondReg); Inst.removeOperand(1); } break; @@ -10203,9 +10175,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy( InsPt->definesRegister(Src, /*TRI=*/nullptr)) { InsPt++; return BuildMI(MBB, InsPt, DL, - get(ST.isWave32() ? 
AMDGPU::S_MOV_B32_term - : AMDGPU::S_MOV_B64_term), - Dst) + get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst) .addReg(Src, 0, SrcSubReg) .addReg(AMDGPU::EXEC, RegState::Implicit); } diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 73a2d0a56aebe..6537b79d58021 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" @@ -27,21 +28,22 @@ namespace { class SILateBranchLowering { private: - const SIRegisterInfo *TRI = nullptr; - const SIInstrInfo *TII = nullptr; - MachineDominatorTree *MDT = nullptr; + const GCNSubtarget &ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineDominatorTree *MDT; + const AMDGPU::LaneMaskConstants &LMC; void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST, bool DynamicVGPR); void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: - SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {} + SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT) + : ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT), + LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); - - unsigned MovOpc; - Register ExecReg; }; class SILateBranchLoweringLegacy : public MachineFunctionPass { @@ -50,8 +52,9 @@ class SILateBranchLoweringLegacy : public MachineFunctionPass { SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { + const GCNSubtarget &ST = MF.getSubtarget(); auto *MDT = &getAnalysis().getDomTree(); - return SILateBranchLowering(MDT).run(MF); + return SILateBranchLowering(ST, MDT).run(MF); } StringRef getPassName() const override { @@ -166,17 +169,16 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI, copyOpWithoutRegFlags(SelectCallee, *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee)); - auto SelectExec = BuildMI(*MI.getParent(), MI, DL, - TII->get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64)) - .addDef(ExecReg); + auto SelectExec = BuildMI(*MI.getParent(), MI, DL, TII->get(LMC.CSelectOpc)) + .addDef(LMC.ExecReg); copyOpWithoutRegFlags(SelectExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec)); copyOpWithoutRegFlags(SelectExec, *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec)); } else { - auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg); + auto SetExec = + BuildMI(*MI.getParent(), MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg); copyOpWithoutRegFlags(SetExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec)); } @@ -206,8 +208,9 @@ void SILateBranchLowering::earlyTerm(MachineInstr &MI, PreservedAnalyses llvm::SILateBranchLoweringPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget(); auto *MDT = &MFAM.getResult(MF); - if (!SILateBranchLowering(MDT).run(MF)) + if (!SILateBranchLowering(ST, MDT).run(MF)) return PreservedAnalyses::all(); return getMachineFunctionPassPreservedAnalyses() @@ -215,13 +218,6 @@ llvm::SILateBranchLoweringPass::run(MachineFunction &MF, } bool SILateBranchLowering::run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - - MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - SmallVector EarlyTermInstrs; SmallVector EpilogInstrs; bool MadeChange = false; @@ -270,8 +266,8 @@ bool SILateBranchLowering::run(MachineFunction &MF) { DebugLoc DL; MF.insert(MF.end(), EarlyExitBlock); - BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc), - ExecReg) + BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(LMC.MovOpc), + LMC.ExecReg) .addImm(0); generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index e97536d36bab2..115a020f44098 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -50,6 +50,7 @@ #include "SILowerControlFlow.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallSet.h" @@ -85,15 +86,7 @@ class SILowerControlFlow { SmallSet RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; - unsigned AndOpc; - unsigned OrOpc; - unsigned XorOpc; - unsigned MovTermOpc; - unsigned Andn2TermOpc; - unsigned XorTermrOpc; - unsigned OrTermrOpc; - unsigned OrSaveExecOpc; - unsigned Exec; + const AMDGPU::LaneMaskConstants &LMC; bool EnableOptimizeEndCf = false; @@ -139,9 +132,11 @@ class SILowerControlFlow { void optimizeEndCf(); public: - SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) - : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} + SILowerControlFlow(const GCNSubtarget *ST, LiveIntervals *LIS, + LiveVariables *LV, MachineDominatorTree *MDT, + MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT), + LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} bool run(MachineFunction &MF); }; @@ -243,18 +238,15 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // will interfere with trying to form s_and_saveexec_b64 later. Register CopyReg = SimpleIf ? 
SaveExecReg : MRI->createVirtualRegister(BoolRC); - MachineInstr *CopyExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(Exec) - .addReg(Exec, RegState::ImplicitDefine); + MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) + .addReg(LMC.ExecReg) + .addReg(LMC.ExecReg, RegState::ImplicitDefine); LoweredIf.insert(CopyReg); Register Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) - .addReg(CopyReg) - .add(Cond); + BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), Tmp).addReg(CopyReg).add(Cond); if (LV) LV->replaceKillInstruction(Cond.getReg(), MI, *And); @@ -262,18 +254,17 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineInstr *Xor = nullptr; if (!SimpleIf) { - Xor = - BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); + Xor = BuildMI(MBB, I, DL, TII->get(LMC.XorOpc), SaveExecReg) + .addReg(Tmp) + .addReg(CopyReg); setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); } // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) - .addReg(Tmp, RegState::Kill); + BuildMI(MBB, I, DL, TII->get(LMC.MovTermOpc), LMC.ExecReg) + .addReg(Tmp, RegState::Kill); if (LV) LV->getVarInfo(Tmp).Kills.push_back(SetExec); @@ -327,8 +318,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // else. Register SaveReg = MRI->createVirtualRegister(BoolRC); MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) - .add(MI.getOperand(1)); // Saved EXEC + BuildMI(MBB, Start, DL, TII->get(LMC.OrSaveExecOpc), SaveReg) + .add(MI.getOperand(1)); // Saved EXEC if (LV) LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec); @@ -338,14 +329,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // This accounts for any modification of the EXEC mask within the block and // can be optimized out pre-RA when not required. - MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) - .addReg(Exec) + MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(LMC.AndOpc), DstReg) + .addReg(LMC.ExecReg) .addReg(SaveReg); MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) - .addReg(Exec) - .addReg(DstReg); + BuildMI(MBB, ElsePt, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(DstReg); // Skip ahead to the unconditional branch in case there are other terminators // present. 
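As a side note on the emitIf lowering above, the lane-mask algebra it emits can be checked on plain 64-bit masks. The standalone snippet below is only a worked example with arbitrarily chosen values, not code from the patch (and the Xor is emitted only for the non-simplified SI_IF).

```cpp
// Simulates the masks produced by SILowerControlFlow::emitIf for wave64:
// COPY, S_AND_B64, S_XOR_B64 and the terminating S_MOV_B64_term.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t exec = 0x00000000FFFFFFFFULL; // lanes active at the SI_IF
  uint64_t cond = 0x000000000000FF0FULL; // lanes whose condition is true

  uint64_t copy = exec;       // CopyExec: COPY of the incoming exec mask
  uint64_t tmp = copy & cond; // And:      lanes entering the then-block
  uint64_t save = tmp ^ copy; // Xor:      lanes to re-enable at else/end_cf
  exec = tmp;                 // SetExec:  run only the then-lanes

  assert((tmp | save) == copy); // the two halves cover the original mask
  assert((tmp & save) == 0);    // and are disjoint
  return 0;
}
```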
@@ -400,16 +391,16 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { Register AndReg; if (!SkipAnding) { AndReg = MRI->createVirtualRegister(BoolRC); - And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) - .addReg(Exec) - .add(MI.getOperand(1)); + And = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), AndReg) + .addReg(LMC.ExecReg) + .add(MI.getOperand(1)); if (LV) LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And); - Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst) .addReg(AndReg) .add(MI.getOperand(2)); } else { - Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst) .add(MI.getOperand(1)) .add(MI.getOperand(2)); if (LV) @@ -436,8 +427,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) - .addReg(Exec) + BuildMI(MBB, &MI, DL, TII->get(LMC.AndN2TermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .add(MI.getOperand(0)); if (LV) LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2); @@ -505,7 +496,7 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { } } - unsigned Opcode = OrOpc; + unsigned Opcode = LMC.OrOpc; MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); @@ -522,14 +513,13 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { if (PDT) PDT->applyUpdates(DTUpdates); } - Opcode = OrTermrOpc; + Opcode = LMC.OrTermOpc; InsPt = MI; } - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(Opcode), LMC.ExecReg) + .addReg(LMC.ExecReg) + .add(MI.getOperand(0)); if (LV) { LV->replaceKillInstruction(DataReg, MI, *NewMI); @@ -597,12 +587,12 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, // does not really modify exec. 
for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && - !(I->isCopy() && I->getOperand(0).getReg() != Exec)) + !(I->isCopy() && I->getOperand(0).getReg() != LMC.ExecReg)) return; for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && - (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec)) + (SrcOp.getReg().isVirtual() || SrcOp.getReg() == LMC.ExecReg)) Src.push_back(SrcOp); } @@ -781,28 +771,6 @@ bool SILowerControlFlow::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); - if (ST.isWave32()) { - AndOpc = AMDGPU::S_AND_B32; - OrOpc = AMDGPU::S_OR_B32; - XorOpc = AMDGPU::S_XOR_B32; - MovTermOpc = AMDGPU::S_MOV_B32_term; - Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; - XorTermrOpc = AMDGPU::S_XOR_B32_term; - OrTermrOpc = AMDGPU::S_OR_B32_term; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; - Exec = AMDGPU::EXEC_LO; - } else { - AndOpc = AMDGPU::S_AND_B64; - OrOpc = AMDGPU::S_OR_B64; - XorOpc = AMDGPU::S_XOR_B64; - MovTermOpc = AMDGPU::S_MOV_B64_term; - Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; - XorTermrOpc = AMDGPU::S_XOR_B64_term; - OrTermrOpc = AMDGPU::S_OR_B64_term; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; - Exec = AMDGPU::EXEC; - } - // Compute set of blocks with kills const bool CanDemote = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; @@ -876,6 +844,7 @@ bool SILowerControlFlow::run(MachineFunction &MF) { } bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget *ST = &MF.getSubtarget(); // This doesn't actually need LiveIntervals, but we can preserve them. auto *LISWrapper = getAnalysisIfAvailable(); LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; @@ -888,12 +857,13 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { getAnalysisIfAvailable(); MachinePostDominatorTree *PDT = PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); + return SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses SILowerControlFlowPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget *ST = &MF.getSubtarget(); LiveIntervals *LIS = MFAM.getCachedResult(MF); LiveVariables *LV = MFAM.getCachedResult(MF); MachineDominatorTree *MDT = @@ -901,7 +871,7 @@ SILowerControlFlowPass::run(MachineFunction &MF, MachinePostDominatorTree *PDT = MFAM.getCachedResult(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); + bool Changed = SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 745e4086bc7fe..aa028c850bd49 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -8,6 +8,7 @@ #include "SIOptimizeExecMasking.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" @@ -25,12 +26,20 @@ using namespace llvm; namespace { class SIOptimizeExecMasking { - MachineFunction *MF = nullptr; - const GCNSubtarget *ST = nullptr; - const SIRegisterInfo *TRI = nullptr; - const SIInstrInfo *TII = nullptr; - const MachineRegisterInfo *MRI = nullptr; - MCRegister Exec; +public: + SIOptimizeExecMasking(MachineFunction *MF) + : MF(MF), ST(&MF->getSubtarget()), TII(ST->getInstrInfo()), + TRI(&TII->getRegisterInfo()), MRI(&MF->getRegInfo()), + LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} + bool run(); + +private: + MachineFunction *MF; + const GCNSubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + const AMDGPU::LaneMaskConstants &LMC; DenseMap SaveExecVCmpMapping; SmallVector, 1> OrXors; @@ -57,13 +66,10 @@ class SIOptimizeExecMasking { bool optimizeExecSequence(); void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI); bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, MCRegister Exec) const; + MachineInstr &VCmp) const; void tryRecordOrSaveexecXorSequence(MachineInstr &MI); bool optimizeOrSaveexecXorSequences(); - -public: - bool run(MachineFunction &MF); }; class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { @@ -91,9 +97,9 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { PreservedAnalyses SIOptimizeExecMaskingPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &) { - SIOptimizeExecMasking Impl; + SIOptimizeExecMasking Impl(&MF); - if (!Impl.run(MF)) + if (!Impl.run()) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); @@ -120,7 +126,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && Src.getReg() == Exec) + if (Src.isReg() && Src.getReg() == LMC.ExecReg) return MI.getOperand(0).getReg(); } } @@ -135,7 +141,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg()) + if (Dst.isReg() && Dst.getReg() == LMC.ExecReg && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } 
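With the constants cached as a class member, the opcode and exec-register checks elsewhere in this file follow one pattern. A hypothetical helper (not in the patch) showing that style:

```cpp
// Matches "exec = s_xor exec, <reg>" in either wave32 or wave64 form, using
// the cached LaneMaskConstants instead of isWave32() dispatch.
#include "AMDGPULaneMaskUtils.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

static bool isXorOfExec(const MachineInstr &MI,
                        const AMDGPU::LaneMaskConstants &LMC) {
  if (MI.getOpcode() != LMC.XorOpc)
    return false;
  const MachineOperand &Dst = MI.getOperand(0);
  const MachineOperand &Src0 = MI.getOperand(1);
  const MachineOperand &Src1 = MI.getOperand(2);
  return Dst.isReg() && Dst.getReg() == LMC.ExecReg && Src0.isReg() &&
         Src1.isReg() &&
         (Src0.getReg() == LMC.ExecReg || Src1.getReg() == LMC.ExecReg);
}
```

This is essentially the check `tryRecordOrSaveexecXorSequence` performs below before pairing the xor with a preceding s_or_saveexec.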
@@ -471,7 +477,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); - PrepareExecInst->getOperand(0).setReg(Exec); + PrepareExecInst->getOperand(0).setReg(LMC.ExecReg); LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); @@ -496,7 +502,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { J = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { - if (SaveExecInst && J->readsRegister(Exec, TRI)) { + if (SaveExecInst && J->readsRegister(LMC.ExecReg, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. @@ -580,8 +586,8 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, - *TRI); + OtherInst->substituteRegister(CopyToExec, LMC.ExecReg, + AMDGPU::NoSubRegister, *TRI); } Changed = true; @@ -593,7 +599,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the // operands extracted from a v_cmp ..., s_and_saveexec pattern. bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( - MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { + MachineInstr &SaveExecInstr, MachineInstr &VCmp) const { const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); if (NewOpcode == -1) @@ -610,7 +616,7 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(*SaveExecInstr.getParent(), InsertPosIt, SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) - .addReg(Exec); + .addReg(LMC.ExecReg); } // Omit dst as V_CMPX is implicitly writing to EXEC. @@ -661,10 +667,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( if (!ST->hasGFX10_3Insts()) return; - const unsigned AndSaveExecOpcode = - ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - - if (MI.getOpcode() != AndSaveExecOpcode) + if (MI.getOpcode() != LMC.AndSaveExecOpc) return; Register SaveExecDest = MI.getOperand(0).getReg(); @@ -690,7 +693,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); }, - {Exec, SaveExecSrc0->getReg()}); + {LMC.ExecReg, SaveExecSrc0->getReg()}); if (!VCmp) return; @@ -748,32 +751,28 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( // to be replaced with // s_andn2_saveexec s_o, s_i. void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) { - const unsigned XorOpcode = - ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; - - if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) { + if (MI.getOpcode() == LMC.XorOpc && &MI != &MI.getParent()->front()) { const MachineOperand &XorDst = MI.getOperand(0); const MachineOperand &XorSrc0 = MI.getOperand(1); const MachineOperand &XorSrc1 = MI.getOperand(2); - if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() && + if (XorDst.isReg() && XorDst.getReg() == LMC.ExecReg && XorSrc0.isReg() && XorSrc1.isReg() && - (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) { - const unsigned OrSaveexecOpcode = ST->isWave32() - ? 
AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64; + (XorSrc0.getReg() == LMC.ExecReg || XorSrc1.getReg() == LMC.ExecReg)) { // Peek at the previous instruction and check if this is a relevant // s_or_saveexec instruction. MachineInstr &PossibleOrSaveexec = *MI.getPrevNode(); - if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode) + if (PossibleOrSaveexec.getOpcode() != LMC.OrSaveExecOpc) return; const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0); const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1); if (OrDst.isReg() && OrSrc0.isReg()) { - if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) || - (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) { + if ((XorSrc0.getReg() == LMC.ExecReg && + XorSrc1.getReg() == OrDst.getReg()) || + (XorSrc0.getReg() == OrDst.getReg() && + XorSrc1.getReg() == LMC.ExecReg)) { OrXors.emplace_back(&PossibleOrSaveexec, &MI); } } @@ -787,15 +786,13 @@ bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() { } bool Changed = false; - const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32 - : AMDGPU::S_ANDN2_SAVEEXEC_B64; for (const auto &Pair : OrXors) { MachineInstr *Or = nullptr; MachineInstr *Xor = nullptr; std::tie(Or, Xor) = Pair; BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(), - TII->get(Andn2Opcode), Or->getOperand(0).getReg()) + TII->get(LMC.AndN2SaveExecOpc), Or->getOperand(0).getReg()) .addReg(Or->getOperand(1).getReg()); Or->eraseFromParent(); @@ -811,24 +808,17 @@ bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - return SIOptimizeExecMasking().run(MF); + return SIOptimizeExecMasking(&MF).run(); } -bool SIOptimizeExecMasking::run(MachineFunction &MF) { - this->MF = &MF; - ST = &MF.getSubtarget(); - TRI = ST->getRegisterInfo(); - TII = ST->getInstrInfo(); - MRI = &MF.getRegInfo(); - Exec = TRI->getExec(); - +bool SIOptimizeExecMasking::run() { bool Changed = optimizeExecSequence(); OrXors.clear(); SaveExecVCmpMapping.clear(); KillFlagCandidates.clear(); static unsigned SearchWindow = 10; - for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock &MBB : *MF) { unsigned SearchCount = 0; for (auto &MI : llvm::reverse(MBB)) { @@ -842,7 +832,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) { tryRecordOrSaveexecXorSequence(MI); tryRecordVCmpxAndSaveexecSequence(MI); - if (MI.modifiesRegister(Exec, TRI)) { + if (MI.modifiesRegister(LMC.ExecReg, TRI)) { break; } @@ -855,7 +845,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) { MachineInstr *SaveExecInstr = Entry.getFirst(); MachineInstr *VCmpInstr = Entry.getSecond(); - Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec); + Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index b2228574378f1..c186f5af78b7f 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -14,6 +14,7 @@ #include "SIOptimizeExecMaskingPreRA.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -28,15 +29,13 @@ namespace { class SIOptimizeExecMaskingPreRA { private: + const GCNSubtarget &ST; const SIRegisterInfo *TRI; const SIInstrInfo *TII; MachineRegisterInfo *MRI; 
LiveIntervals *LIS; + const AMDGPU::LaneMaskConstants &LMC; - unsigned AndOpc; - unsigned Andn2Opc; - unsigned OrSaveExecOpc; - unsigned XorTermrOpc; MCRegister CondReg; MCRegister ExecReg; @@ -44,7 +43,10 @@ class SIOptimizeExecMaskingPreRA { bool optimizeElseBranch(MachineBasicBlock &MBB); public: - SIOptimizeExecMaskingPreRA(LiveIntervals *LIS) : LIS(LIS) {} + SIOptimizeExecMaskingPreRA(MachineFunction &MF, LiveIntervals *LIS) + : ST(MF.getSubtarget()), TRI(ST.getRegisterInfo()), + TII(ST.getInstrInfo()), MRI(&MF.getRegInfo()), LIS(LIS), + LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); }; @@ -138,8 +140,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS); - if (!And || And->getOpcode() != AndOpc || - !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) + if (!And || And->getOpcode() != LMC.AndOpc || !And->getOperand(1).isReg() || + !And->getOperand(2).isReg()) return false; MachineOperand *AndCC = &And->getOperand(1); @@ -207,7 +209,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { << *And); MachineInstr *Andn2 = - BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), + BuildMI(MBB, *And, And->getDebugLoc(), TII->get(LMC.AndN2Opc), And->getOperand(0).getReg()) .addReg(ExecReg) .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); @@ -294,11 +296,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) { // Check this is an else block. auto First = MBB.begin(); MachineInstr &SaveExecMI = *First; - if (SaveExecMI.getOpcode() != OrSaveExecOpc) + if (SaveExecMI.getOpcode() != LMC.OrSaveExecOpc) return false; auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) { - return MI.getOpcode() == XorTermrOpc; + return MI.getOpcode() == LMC.XorTermOpc; }); if (I == MBB.terminators().end()) return false; @@ -314,7 +316,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) { MachineInstr *AndExecMI = nullptr; I--; while (I != First && !AndExecMI) { - if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg && + if (I->getOpcode() == LMC.AndOpc && I->getOperand(0).getReg() == DstReg && I->getOperand(1).getReg() == Register(ExecReg)) AndExecMI = &*I; I--; @@ -352,7 +354,7 @@ PreservedAnalyses SIOptimizeExecMaskingPreRAPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { auto &LIS = MFAM.getResult(MF); - SIOptimizeExecMaskingPreRA(&LIS).run(MF); + SIOptimizeExecMaskingPreRA(MF, &LIS).run(MF); return PreservedAnalyses::all(); } @@ -362,23 +364,12 @@ bool SIOptimizeExecMaskingPreRALegacy::runOnMachineFunction( return false; auto *LIS = &getAnalysis().getLIS(); - return SIOptimizeExecMaskingPreRA(LIS).run(MF); + return SIOptimizeExecMaskingPreRA(MF, LIS).run(MF); } bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - TRI = ST.getRegisterInfo(); - TII = ST.getInstrInfo(); - MRI = &MF.getRegInfo(); - - const bool Wave32 = ST.isWave32(); - AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; - OrSaveExecOpc = - Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC); - ExecReg = MCRegister::from(Wave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC); + CondReg = MCRegister::from(LMC.VccReg); + ExecReg = MCRegister::from(LMC.ExecReg); DenseSet RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI}); bool Changed = false; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 1198bbc310daa..6611e1e6507e1 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -69,6 +69,7 @@ #include "SIWholeQuadMode.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/MapVector.h" @@ -155,7 +156,7 @@ class SIWholeQuadMode { MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) : ST(&MF.getSubtarget()), TII(ST->getInstrInfo()), TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT), - PDT(PDT) {} + PDT(PDT), LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} bool run(MachineFunction &MF); private: @@ -166,15 +167,8 @@ class SIWholeQuadMode { LiveIntervals *LIS; MachineDominatorTree *MDT; MachinePostDominatorTree *PDT; + const AMDGPU::LaneMaskConstants &LMC; - unsigned AndOpc; - unsigned AndTermOpc; - unsigned AndN2Opc; - unsigned XorOpc; - unsigned AndSaveExecOpc; - unsigned AndSaveExecTermOpc; - unsigned WQMOpc; - Register Exec; Register LiveMaskReg; DenseMap Instructions; @@ -882,14 +876,12 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { const MachineOperand &Op1 = MI.getOperand(1); // VCC represents lanes killed. - Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; - if (TRI->isVGPR(*MRI, Op0.getReg())) { Opcode = AMDGPU::getVOPe32(Opcode); VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); } else { VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(VCC, RegState::Define) + .addReg(LMC.VccReg, RegState::Define) .addImm(0) // src0 modifiers .add(Op1) .addImm(0) // src1 modifiers @@ -898,9 +890,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { } MachineInstr *MaskUpdateMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) - .addReg(VCC); + .addReg(LMC.VccReg); // State of SCC represents whether any lanes are live in mask, // if SCC is 0 then no lanes will be alive anymore. @@ -908,7 +900,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); MachineInstr *ExecMaskMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); + BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(LMC.VccReg); assert(MBB.succ_size() == 1); @@ -942,9 +936,9 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { if (Op.isImm()) { if (Op.getImm() == KillVal) { // Static: all active lanes are killed - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) - .addReg(Exec); + .addReg(LMC.ExecReg); } else { // Static: kill does nothing bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end(); @@ -964,14 +958,15 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { // Op represents live lanes after kill, // so exec mask needs to be factored in. 
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); - ComputeKilledMaskMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op); - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + ComputeKilledMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), TmpReg) + .addReg(LMC.ExecReg) + .add(Op); + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) .addReg(TmpReg); } else { // Op represents lanes to kill - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) .add(Op); } @@ -990,24 +985,25 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { if (IsDemote) { // Demote - deactivate quads with only helper lanes LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); - WQMMaskMI = - BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); - NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) - .addReg(Exec) + WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.WQMOpc), LiveMaskWQM) + .addReg(LiveMaskReg); + NewTerm = BuildMI(MBB, MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(LiveMaskWQM); } else { // Kill - deactivate lanes no longer in live mask if (Op.isImm()) { - unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + NewTerm = + BuildMI(MBB, &MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(0); } else if (!IsWQM) { - NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) - .addReg(Exec) + NewTerm = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(LiveMaskReg); } else { - unsigned Opcode = KillVal ? AndN2Opc : AndOpc; - NewTerm = - BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + unsigned Opcode = KillVal ? LMC.AndN2Opc : LMC.AndOpc; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(Opcode), LMC.ExecReg) + .addReg(LMC.ExecReg) + .add(Op); } } @@ -1183,13 +1179,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineInstr *MI; if (SaveWQM) { - unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; + unsigned Opcode = + IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc; MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) .addReg(LiveMaskReg); } else { - unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) - .addReg(Exec) + unsigned Opcode = IsTerminator ? 
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg)
+             .addReg(LMC.ExecReg)
              .addReg(LiveMaskReg);
   }
 
@@ -1203,10 +1200,11 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
   MachineInstr *MI;
 
   if (SavedWQM) {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg)
             .addReg(SavedWQM);
   } else {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
+             .addReg(LMC.ExecReg);
   }
 
   LIS->InsertMachineInstrInMaps(*MI);
@@ -1246,11 +1244,11 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
 
   if (CurrentStrictState == StateStrictWWM) {
     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
-                 Exec)
+                 LMC.ExecReg)
             .addReg(SavedOrig);
   } else {
     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
-                 Exec)
+                 LMC.ExecReg)
             .addReg(SavedOrig);
   }
   LIS->InsertMachineInstrInMaps(*MI);
@@ -1280,7 +1278,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
   if (IsEntry) {
     // Skip the instruction that saves LiveMask
     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
-        II->getOperand(1).getReg() == TRI->getExec())
+        II->getOperand(1).getReg() == LMC.ExecReg)
       ++II;
   }
 
@@ -1565,18 +1563,14 @@ bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
 
 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   MachineBasicBlock *MBB = MI.getParent();
-  bool IsWave32 = ST->isWave32();
 
   if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
     assert(MBB == &MBB->getParent()->front() &&
            "init whole wave not in entry block");
     Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
-    MachineInstr *SaveExec =
-        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
-                                  : AMDGPU::S_OR_SAVEEXEC_B64),
-                EntryExec)
-            .addImm(-1);
+    MachineInstr *SaveExec = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+                                     TII->get(LMC.OrSaveExecOpc), EntryExec)
+                                 .addImm(-1);
 
     // Replace all uses of MI's destination reg with EntryExec.
     MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
@@ -1596,11 +1590,9 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
 
   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
     // This should be before all vector instructions.
-    MachineInstr *InitMI =
-        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
-                Exec)
-            .addImm(MI.getOperand(0).getImm());
+    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+                                   TII->get(LMC.MovOpc), LMC.ExecReg)
+                               .addImm(MI.getOperand(0).getImm());
     if (LIS) {
       LIS->RemoveMachineInstrFromMaps(MI);
       LIS->InsertMachineInstrInMaps(*InitMI);
@@ -1644,19 +1636,14 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                    .addReg(InputReg)
                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-  auto BfmMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
-          .addReg(CountReg)
-          .addImm(0);
+  auto BfmMI = BuildMI(*MBB, FirstMI, DL, TII->get(LMC.BfmOpc), LMC.ExecReg)
+                   .addReg(CountReg)
+                   .addImm(0);
   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                    .addReg(CountReg, RegState::Kill)
                    .addImm(WavefrontSize);
   auto CmovMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-              Exec)
-          .addImm(-1);
+      BuildMI(*MBB, FirstMI, DL, TII->get(LMC.CMovOpc), LMC.ExecReg).addImm(-1);
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -1711,30 +1698,10 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
   SetInactiveInstrs.clear();
   StateTransition.clear();
 
-  if (ST->isWave32()) {
-    AndOpc = AMDGPU::S_AND_B32;
-    AndTermOpc = AMDGPU::S_AND_B32_term;
-    AndN2Opc = AMDGPU::S_ANDN2_B32;
-    XorOpc = AMDGPU::S_XOR_B32;
-    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
-    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
-    WQMOpc = AMDGPU::S_WQM_B32;
-    Exec = AMDGPU::EXEC_LO;
-  } else {
-    AndOpc = AMDGPU::S_AND_B64;
-    AndTermOpc = AMDGPU::S_AND_B64_term;
-    AndN2Opc = AMDGPU::S_ANDN2_B64;
-    XorOpc = AMDGPU::S_XOR_B64;
-    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
-    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
-    WQMOpc = AMDGPU::S_WQM_B64;
-    Exec = AMDGPU::EXEC;
-  }
-
   const char GlobalFlags = analyzeFunction(MF);
   bool Changed = false;
 
-  LiveMaskReg = Exec;
+  LiveMaskReg = LMC.ExecReg;
 
   MachineBasicBlock &Entry = MF.front();
   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
@@ -1748,7 +1715,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
     MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-            .addReg(Exec);
+            .addReg(LMC.ExecReg);
     LIS->InsertMachineInstrInMaps(*MI);
     Changed = true;
   }
@@ -1779,8 +1746,9 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
     Changed |= lowerKillInstrs(false);
   } else if (GlobalFlags == StateWQM) {
     // Shader only needs WQM
-    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
-                  .addReg(Exec);
+    auto MI =
+        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
+            .addReg(LMC.ExecReg);
     LIS->InsertMachineInstrInMaps(*MI);
     lowerKillInstrs(true);
     Changed = true;
@@ -1798,7 +1766,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
   }
 
   // Compute live range for live mask
-  if (LiveMaskReg != Exec)
+  if (LiveMaskReg != LMC.ExecReg)
     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   // Physical registers like SCC aren't tracked by default anyway, so just