Skip to content

Commit fdb06d9

Browse files
authored
[AMDGPU] Refactor out common exec mask opcode patterns (NFCI) (#154718)
Create utility mechanism for finding wave size dependent opcodes used to manipulate exec/lane masks.
1 parent 10d0d95 commit fdb06d9

11 files changed

+338
-386
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
10+
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
11+
12+
#include "GCNSubtarget.h"
13+
#include "llvm/CodeGen/Register.h"
14+
15+
namespace llvm {
16+
17+
class GCNSubtarget;
18+
19+
namespace AMDGPU {
20+
21+
class LaneMaskConstants {
22+
public:
23+
const Register ExecReg;
24+
const Register VccReg;
25+
const unsigned AndOpc;
26+
const unsigned AndTermOpc;
27+
const unsigned AndN2Opc;
28+
const unsigned AndN2SaveExecOpc;
29+
const unsigned AndN2TermOpc;
30+
const unsigned AndSaveExecOpc;
31+
const unsigned AndSaveExecTermOpc;
32+
const unsigned BfmOpc;
33+
const unsigned CMovOpc;
34+
const unsigned CSelectOpc;
35+
const unsigned MovOpc;
36+
const unsigned MovTermOpc;
37+
const unsigned OrOpc;
38+
const unsigned OrTermOpc;
39+
const unsigned OrSaveExecOpc;
40+
const unsigned XorOpc;
41+
const unsigned XorTermOpc;
42+
const unsigned WQMOpc;
43+
44+
constexpr LaneMaskConstants(bool IsWave32)
45+
: ExecReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC),
46+
VccReg(IsWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC),
47+
AndOpc(IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
48+
AndTermOpc(IsWave32 ? AMDGPU::S_AND_B32_term : AMDGPU::S_AND_B64_term),
49+
AndN2Opc(IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64),
50+
AndN2SaveExecOpc(IsWave32 ? AMDGPU::S_ANDN2_SAVEEXEC_B32
51+
: AMDGPU::S_ANDN2_SAVEEXEC_B64),
52+
AndN2TermOpc(IsWave32 ? AMDGPU::S_ANDN2_B32_term
53+
: AMDGPU::S_ANDN2_B64_term),
54+
AndSaveExecOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32
55+
: AMDGPU::S_AND_SAVEEXEC_B64),
56+
AndSaveExecTermOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32_term
57+
: AMDGPU::S_AND_SAVEEXEC_B64_term),
58+
BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
59+
CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
60+
CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64),
61+
MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
62+
MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term),
63+
OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64),
64+
OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term),
65+
OrSaveExecOpc(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
66+
: AMDGPU::S_OR_SAVEEXEC_B64),
67+
XorOpc(IsWave32 ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64),
68+
XorTermOpc(IsWave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term),
69+
WQMOpc(IsWave32 ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64) {}
70+
71+
static inline const LaneMaskConstants &get(const GCNSubtarget &ST);
72+
};
73+
74+
// One shared, compile-time-initialized instance per supported wave size.
// `inline` gives each variable a single definition across every translation
// unit that includes this header, instead of a private internal-linkage copy
// in each of them.
inline constexpr LaneMaskConstants LaneMaskConstants32 =
    LaneMaskConstants(/*IsWave32=*/true);
inline constexpr LaneMaskConstants LaneMaskConstants64 =
    LaneMaskConstants(/*IsWave32=*/false);
78+
79+
/// Returns the constants for the wavefront size reported by \p ST:
/// LaneMaskConstants32 when it is 32, LaneMaskConstants64 otherwise.
/// Asserts that the reported size is either 32 or 64.
inline const LaneMaskConstants &LaneMaskConstants::get(const GCNSubtarget &ST) {
  unsigned WavefrontSize = ST.getWavefrontSize();
  assert((WavefrontSize == 32 || WavefrontSize == 64) &&
         "Unexpected wavefront size");
  return WavefrontSize == 32 ? LaneMaskConstants32 : LaneMaskConstants64;
}
84+
85+
} // end namespace AMDGPU
86+
87+
} // end namespace llvm
88+
89+
#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
#include "AMDGPU.h"
7474
#include "AMDGPUGlobalISelUtils.h"
7575
#include "AMDGPUInstrInfo.h"
76+
#include "AMDGPULaneMaskUtils.h"
7677
#include "GCNSubtarget.h"
7778
#include "SIMachineFunctionInfo.h"
7879
#include "SIRegisterInfo.h"
@@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
783784
MachineFunction *MF = &B.getMF();
784785

785786
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786-
const unsigned MovExecOpc =
787-
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788-
const unsigned MovExecTermOpc =
789-
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790-
791-
const unsigned XorTermOpc = Subtarget.isWave32() ?
792-
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
793-
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
794-
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
795-
const unsigned ExecReg = Subtarget.isWave32() ?
796-
AMDGPU::EXEC_LO : AMDGPU::EXEC;
787+
const AMDGPU::LaneMaskConstants &LMC =
788+
AMDGPU::LaneMaskConstants::get(Subtarget);
797789

798790
#ifndef NDEBUG
799791
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
@@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
941933
MRI.setRegClass(CondReg, WaveRC);
942934

943935
// Update EXEC, save the original EXEC value to VCC.
944-
B.buildInstr(AndSaveExecOpc)
945-
.addDef(NewExec)
946-
.addReg(CondReg, RegState::Kill);
936+
B.buildInstr(LMC.AndSaveExecOpc)
937+
.addDef(NewExec)
938+
.addReg(CondReg, RegState::Kill);
947939

948940
MRI.setSimpleHint(NewExec, CondReg);
949941

950942
B.setInsertPt(*BodyBB, BodyBB->end());
951943

952944
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
953-
B.buildInstr(XorTermOpc)
954-
.addDef(ExecReg)
955-
.addReg(ExecReg)
956-
.addReg(NewExec);
945+
B.buildInstr(LMC.XorTermOpc)
946+
.addDef(LMC.ExecReg)
947+
.addReg(LMC.ExecReg)
948+
.addReg(NewExec);
957949

958950
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959951
// s_cbranch_scc0?
@@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
962954
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
963955

964956
// Save the EXEC mask before the loop.
965-
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966-
.addReg(ExecReg);
957+
BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
958+
.addReg(LMC.ExecReg);
967959

968960
// Restore the EXEC mask after the loop.
969961
B.setMBB(*RestoreExecBB);
970-
B.buildInstr(MovExecTermOpc)
971-
.addDef(ExecReg)
972-
.addReg(SaveExecReg);
962+
B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
973963

974964
// Set the insert point after the original instruction, so any new
975965
// instructions will be in the remainder.

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666

6767
#include "SIFixSGPRCopies.h"
6868
#include "AMDGPU.h"
69+
#include "AMDGPULaneMaskUtils.h"
6970
#include "GCNSubtarget.h"
7071
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
7172
#include "llvm/CodeGen/MachineDominators.h"
@@ -1145,7 +1146,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
11451146
}
11461147

11471148
void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
1148-
bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
1149+
const AMDGPU::LaneMaskConstants &LMC =
1150+
AMDGPU::LaneMaskConstants::get(MF.getSubtarget<GCNSubtarget>());
11491151
for (MachineBasicBlock &MBB : MF) {
11501152
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
11511153
++I) {
@@ -1159,10 +1161,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
11591161
Register SCCCopy =
11601162
MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
11611163
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
1162-
MI.getDebugLoc(),
1163-
TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
1164-
: AMDGPU::S_CSELECT_B64),
1165-
SCCCopy)
1164+
MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
11661165
.addImm(-1)
11671166
.addImm(0);
11681167
I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
@@ -1172,14 +1171,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
11721171
continue;
11731172
}
11741173
if (DstReg == AMDGPU::SCC) {
1175-
unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
1176-
Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
11771174
Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
11781175
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
1179-
MI.getDebugLoc(), TII->get(Opcode))
1176+
MI.getDebugLoc(), TII->get(LMC.AndOpc))
11801177
.addReg(Tmp, getDefRegState(true))
11811178
.addReg(SrcReg)
1182-
.addReg(Exec);
1179+
.addReg(LMC.ExecReg);
11831180
MI.eraseFromParent();
11841181
}
11851182
}

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "SIFrameLowering.h"
1010
#include "AMDGPU.h"
11+
#include "AMDGPULaneMaskUtils.h"
1112
#include "GCNSubtarget.h"
1213
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1314
#include "SIMachineFunctionInfo.h"
@@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores(
984985
const SIInstrInfo *TII = ST.getInstrInfo();
985986
const SIRegisterInfo &TRI = TII->getRegisterInfo();
986987
MachineRegisterInfo &MRI = MF.getRegInfo();
988+
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
987989

988990
// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
989991
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
@@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores(
10151017
StoreWWMRegisters(WWMScratchRegs);
10161018

10171019
auto EnableAllLanes = [&]() {
1018-
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1019-
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1020+
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
10201021
};
10211022

10221023
if (!WWMCalleeSavedRegs.empty()) {
@@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores(
10431044
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
10441045
} else if (ScratchExecCopy) {
10451046
// FIXME: Split block and make terminator.
1046-
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1047-
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1047+
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
10481048
.addReg(ScratchExecCopy, RegState::Kill);
10491049
LiveUnits.addReg(ScratchExecCopy);
10501050
}
@@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores(
10921092
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10931093
const SIInstrInfo *TII = ST.getInstrInfo();
10941094
const SIRegisterInfo &TRI = TII->getRegisterInfo();
1095+
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
10951096
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
10961097

10971098
for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
@@ -1147,16 +1148,14 @@ void SIFrameLowering::emitCSRSpillRestores(
11471148
Register OrigExec = Return.getOperand(0).getReg();
11481149

11491150
if (!WWMScratchRegs.empty()) {
1150-
unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
1151-
BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
1151+
BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg)
11521152
.addReg(OrigExec)
11531153
.addImm(-1);
11541154
RestoreWWMRegisters(WWMScratchRegs);
11551155
}
11561156

11571157
// Restore original EXEC.
1158-
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1159-
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
1158+
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec);
11601159

11611160
// Drop the first operand and update the opcode.
11621161
Return.removeOperand(0);
@@ -1173,8 +1172,7 @@ void SIFrameLowering::emitCSRSpillRestores(
11731172
RestoreWWMRegisters(WWMScratchRegs);
11741173
if (!WWMCalleeSavedRegs.empty()) {
11751174
if (ScratchExecCopy) {
1176-
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1177-
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1175+
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
11781176
} else {
11791177
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
11801178
/*IsProlog*/ false,
@@ -1185,8 +1183,7 @@ void SIFrameLowering::emitCSRSpillRestores(
11851183
RestoreWWMRegisters(WWMCalleeSavedRegs);
11861184
if (ScratchExecCopy) {
11871185
// FIXME: Split block and make terminator.
1188-
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1189-
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1186+
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
11901187
.addReg(ScratchExecCopy, RegState::Kill);
11911188
}
11921189
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "SIISelLowering.h"
1515
#include "AMDGPU.h"
1616
#include "AMDGPUInstrInfo.h"
17+
#include "AMDGPULaneMaskUtils.h"
1718
#include "AMDGPUTargetMachine.h"
1819
#include "GCNSubtarget.h"
1920
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -5027,6 +5028,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
50275028
MachineFunction *MF = OrigBB.getParent();
50285029
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
50295030
const SIRegisterInfo *TRI = ST.getRegisterInfo();
5031+
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
50305032
MachineBasicBlock::iterator I = LoopBB.begin();
50315033

50325034
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
@@ -5058,10 +5060,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
50585060
.addReg(Idx.getReg(), 0, Idx.getSubReg());
50595061

50605062
// Update EXEC, save the original EXEC value to VCC.
5061-
BuildMI(LoopBB, I, DL,
5062-
TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
5063-
: AMDGPU::S_AND_SAVEEXEC_B64),
5064-
NewExec)
5063+
BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
50655064
.addReg(CondReg, RegState::Kill);
50665065

50675066
MRI.setSimpleHint(NewExec, CondReg);
@@ -5088,13 +5087,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
50885087
}
50895088

50905089
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
5091-
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50925090
MachineInstr *InsertPt =
5093-
BuildMI(LoopBB, I, DL,
5094-
TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
5095-
: AMDGPU::S_XOR_B64_term),
5096-
Exec)
5097-
.addReg(Exec)
5091+
BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5092+
.addReg(LMC.ExecReg)
50985093
.addReg(NewExec);
50995094

51005095
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
@@ -5129,15 +5124,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
51295124
Register DstReg = MI.getOperand(0).getReg();
51305125
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
51315126
Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5132-
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5133-
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5127+
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
51345128

51355129
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
51365130

51375131
// Save the EXEC mask
51385132
// clang-format off
5139-
BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5140-
.addReg(Exec);
5133+
BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5134+
.addReg(LMC.ExecReg);
51415135
// clang-format on
51425136

51435137
auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
@@ -5157,7 +5151,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
51575151
LoopBB->addSuccessor(LandingPad);
51585152
MachineBasicBlock::iterator First = LandingPad->begin();
51595153
// clang-format off
5160-
BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5154+
BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
51615155
.addReg(SaveExec);
51625156
// clang-format on
51635157

0 commit comments

Comments
 (0)