Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H

#include "GCNSubtarget.h"
#include "llvm/CodeGen/Register.h"

namespace llvm {

class GCNSubtarget;

namespace AMDGPU {

/// Table of the register and opcode constants whose value depends on whether
/// the wavefront is 32 or 64 lanes wide.
///
/// Every member is chosen once, at construction time, between its wave32
/// (`*_B32` / `*_LO`) and wave64 (`*_B64`) variant. Code that manipulates
/// EXEC or other lane masks can then use e.g. `LMC.AndOpc` instead of
/// repeating an `IsWave32 ? ... : ...` selection at every use site.
///
/// All members are `const`, so an instance is immutable after construction
/// and is not assignable.
class LaneMaskConstants {
public:
  /// EXEC_LO for wave32, EXEC for wave64.
  const Register ExecReg;
  /// VCC_LO for wave32, VCC for wave64.
  const Register VccReg;

  // Opcodes; each is the _B32 form for wave32 and the _B64 form for wave64.
  // Names ending in "TermOpc" hold the corresponding `_term` opcode.
  const unsigned AndOpc;             ///< S_AND
  const unsigned AndTermOpc;         ///< S_AND (_term)
  const unsigned AndN2Opc;           ///< S_ANDN2
  const unsigned AndN2SaveExecOpc;   ///< S_ANDN2_SAVEEXEC
  const unsigned AndN2TermOpc;       ///< S_ANDN2 (_term)
  const unsigned AndSaveExecOpc;     ///< S_AND_SAVEEXEC
  const unsigned AndSaveExecTermOpc; ///< S_AND_SAVEEXEC (_term)
  const unsigned BfmOpc;             ///< S_BFM
  const unsigned CMovOpc;            ///< S_CMOV
  const unsigned CSelectOpc;         ///< S_CSELECT
  const unsigned MovOpc;             ///< S_MOV
  const unsigned MovTermOpc;         ///< S_MOV (_term)
  const unsigned OrOpc;              ///< S_OR
  const unsigned OrTermOpc;          ///< S_OR (_term)
  const unsigned OrSaveExecOpc;      ///< S_OR_SAVEEXEC
  const unsigned XorOpc;             ///< S_XOR
  const unsigned XorTermOpc;         ///< S_XOR (_term)
  const unsigned WQMOpc;             ///< S_WQM

  /// Build the table for one wave size.
  ///
  /// \param IsWave32 true selects the 32-bit registers/opcodes, false the
  ///                 64-bit ones.
  ///
  /// Marked `explicit` so that a stray `bool` (or integer) can never be
  /// silently converted into a whole constant table.
  explicit constexpr LaneMaskConstants(bool IsWave32)
      : ExecReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC),
        VccReg(IsWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC),
        AndOpc(IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
        AndTermOpc(IsWave32 ? AMDGPU::S_AND_B32_term : AMDGPU::S_AND_B64_term),
        AndN2Opc(IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64),
        AndN2SaveExecOpc(IsWave32 ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                  : AMDGPU::S_ANDN2_SAVEEXEC_B64),
        AndN2TermOpc(IsWave32 ? AMDGPU::S_ANDN2_B32_term
                              : AMDGPU::S_ANDN2_B64_term),
        AndSaveExecOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32
                                : AMDGPU::S_AND_SAVEEXEC_B64),
        AndSaveExecTermOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32_term
                                    : AMDGPU::S_AND_SAVEEXEC_B64_term),
        BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
        CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
        CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64),
        MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
        MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term),
        OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64),
        OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term),
        OrSaveExecOpc(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                               : AMDGPU::S_OR_SAVEEXEC_B64),
        XorOpc(IsWave32 ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64),
        XorTermOpc(IsWave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term),
        WQMOpc(IsWave32 ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64) {}

  /// Return the constant table that matches the wavefront size reported by
  /// \p ST. Defined below, after the two tables it refers to.
  static inline const LaneMaskConstants &get(const GCNSubtarget &ST);
};

// The two constant tables, one per supported wave size.
//
// These are `inline` variables rather than `static` ones on purpose: this is
// a header, and `LaneMaskConstants::get` is an inline function that returns
// references to these objects. With internal linkage every translation unit
// would get its own copy, so the inline function would refer to a different
// entity in each TU (an ODR violation) and the returned address would differ
// between TUs. `inline constexpr` gives exactly one object program-wide.
inline constexpr LaneMaskConstants LaneMaskConstants32 =
    LaneMaskConstants(/*IsWave32=*/true);
inline constexpr LaneMaskConstants LaneMaskConstants64 =
    LaneMaskConstants(/*IsWave32=*/false);

/// Return the lane-mask constant table for the wavefront size of \p ST.
///
/// \param ST subtarget whose `getWavefrontSize()` selects the table.
/// \returns a reference to `LaneMaskConstants32` when the wavefront size is
///          32, otherwise to `LaneMaskConstants64`.
///
/// Only wavefront sizes 32 and 64 are expected; this is checked by an
/// assertion. In builds where assertions are compiled out, any size other
/// than 32 falls through to the wave64 table.
inline const LaneMaskConstants &LaneMaskConstants::get(const GCNSubtarget &ST) {
  const unsigned WavefrontSize = ST.getWavefrontSize();
  assert((WavefrontSize == 32 || WavefrontSize == 64) &&
         "lane mask constants exist only for wave32 and wave64");
  return WavefrontSize == 32 ? LaneMaskConstants32 : LaneMaskConstants64;
}

} // end namespace AMDGPU

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
36 changes: 13 additions & 23 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
Expand Down Expand Up @@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineFunction *MF = &B.getMF();

const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned MovExecOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const unsigned MovExecTermOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
const unsigned ExecReg = Subtarget.isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC;
const AMDGPU::LaneMaskConstants &LMC =
AMDGPU::LaneMaskConstants::get(Subtarget);

#ifndef NDEBUG
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
Expand Down Expand Up @@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MRI.setRegClass(CondReg, WaveRC);

// Update EXEC, save the original EXEC value to VCC.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);
B.buildInstr(LMC.AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);

MRI.setSimpleHint(NewExec, CondReg);

B.setInsertPt(*BodyBB, BodyBB->end());

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
.addReg(ExecReg)
.addReg(NewExec);
B.buildInstr(LMC.XorTermOpc)
.addDef(LMC.ExecReg)
.addReg(LMC.ExecReg)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
Expand All @@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
.addReg(ExecReg);
BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
.addReg(LMC.ExecReg);

// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
B.buildInstr(MovExecTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);
B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);

// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
Expand Down
15 changes: 6 additions & 9 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@

#include "SIFixSGPRCopies.h"
#include "AMDGPU.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
Expand Down Expand Up @@ -1145,7 +1146,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
}

void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
const AMDGPU::LaneMaskConstants &LMC =
AMDGPU::LaneMaskConstants::get(MF.getSubtarget<GCNSubtarget>());
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
Expand All @@ -1159,10 +1161,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
Register SCCCopy =
MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64),
SCCCopy)
MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
.addImm(-1)
.addImm(0);
I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
Expand All @@ -1172,14 +1171,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
continue;
}
if (DstReg == AMDGPU::SCC) {
unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
MI.getDebugLoc(), TII->get(Opcode))
MI.getDebugLoc(), TII->get(LMC.AndOpc))
.addReg(Tmp, getDefRegState(true))
.addReg(SrcReg)
.addReg(Exec);
.addReg(LMC.ExecReg);
MI.eraseFromParent();
}
}
Expand Down
21 changes: 9 additions & 12 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
Expand Down Expand Up @@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores(
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
Expand Down Expand Up @@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores(
StoreWWMRegisters(WWMScratchRegs);

auto EnableAllLanes = [&]() {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
};

if (!WWMCalleeSavedRegs.empty()) {
Expand All @@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores(
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
} else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(ScratchExecCopy, RegState::Kill);
LiveUnits.addReg(ScratchExecCopy);
}
Expand Down Expand Up @@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
Register FramePtrReg = FuncInfo->getFrameOffsetReg();

for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
Expand Down Expand Up @@ -1147,16 +1148,14 @@ void SIFrameLowering::emitCSRSpillRestores(
Register OrigExec = Return.getOperand(0).getReg();

if (!WWMScratchRegs.empty()) {
unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg)
.addReg(OrigExec)
.addImm(-1);
RestoreWWMRegisters(WWMScratchRegs);
}

// Restore original EXEC.
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec);

// Drop the first operand and update the opcode.
Return.removeOperand(0);
Expand All @@ -1173,8 +1172,7 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false,
Expand All @@ -1185,8 +1183,7 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMCalleeSavedRegs);
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(ScratchExecCopy, RegState::Kill);
}
}
Expand Down
24 changes: 9 additions & 15 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
Expand Down Expand Up @@ -5027,6 +5028,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
MachineFunction *MF = OrigBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
MachineBasicBlock::iterator I = LoopBB.begin();

const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Expand Down Expand Up @@ -5058,10 +5060,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
.addReg(Idx.getReg(), 0, Idx.getSubReg());

// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL,
TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
: AMDGPU::S_AND_SAVEEXEC_B64),
NewExec)
BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
.addReg(CondReg, RegState::Kill);

MRI.setSimpleHint(NewExec, CondReg);
Expand All @@ -5088,13 +5087,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
}

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL,
TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term),
Exec)
.addReg(Exec)
BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
.addReg(LMC.ExecReg)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
Expand Down Expand Up @@ -5129,15 +5124,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

// Save the EXEC mask
// clang-format off
BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
.addReg(Exec);
BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
.addReg(LMC.ExecReg);
// clang-format on

auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
Expand All @@ -5157,7 +5151,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
LoopBB->addSuccessor(LandingPad);
MachineBasicBlock::iterator First = LandingPad->begin();
// clang-format off
BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(SaveExec);
// clang-format on

Expand Down
Loading