Skip to content

Commit ca1f20e

Browse files
cdevadas authored and kzhuravl committed
[AMDGPU] Split vgpr regalloc pipeline (llvm#93526)
Allocating wwm-registers and regular VGPR operands together imposes many challenges in the way the registers are reused during allocation. There are times when regalloc reuses the registers of regular VGPR operations for wwm-operations in a small range, unintentionally clobbering their inactive lanes and causing correctness issues that are hard to trace. This patch splits the VGPR allocation pipeline further to allocate wwm-registers first and the regular VGPR operands in a separate pipeline. The split ensures that the physical registers used for wwm allocations won't take part in the next allocation pipeline, avoiding any such clobbering. Change-Id: Ie5a8f7a934c39eac1c6bd38b68a721f5178ddfbe
1 parent 539e3d6 commit ca1f20e

File tree

79 files changed

+8300
-8738
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+8300
-8738
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
184184
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
185185
}
186186

187+
const MachineFunction &getMF() const { return *MF; }
188+
187189
//===--------------------------------------------------------------------===//
188190
// Function State
189191
//===--------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
5656
FunctionPass *createAMDGPUCodeGenPreparePass();
5757
FunctionPass *createAMDGPULateCodeGenPreparePass();
5858
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
59+
FunctionPass *createAMDGPUReserveWWMRegsPass();
5960
FunctionPass *createAMDGPURewriteOutArgumentsPass();
6061
ModulePass *
6162
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -136,6 +137,9 @@ struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
136137
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
137138
};
138139

140+
void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
141+
extern char &AMDGPUReserveWWMRegsID;
142+
139143
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
140144
extern char &AMDGPURewriteOutArgumentsID;
141145

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass should be invoked at the end of wwm-regalloc pipeline.
11+
/// It identifies the WWM regs allocated during this pipeline and adds
12+
/// them to the list of reserved registers so that they won't be available for
13+
/// regular VGPR allocation in the subsequent regalloc pipeline.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "AMDGPU.h"
18+
#include "GCNSubtarget.h"
19+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20+
#include "SIMachineFunctionInfo.h"
21+
#include "llvm/CodeGen/LiveIntervals.h"
22+
#include "llvm/CodeGen/MachineFunctionPass.h"
23+
#include "llvm/CodeGen/VirtRegMap.h"
24+
#include "llvm/InitializePasses.h"
25+
26+
using namespace llvm;
27+
28+
#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
29+
30+
namespace {
31+
32+
class AMDGPUReserveWWMRegs : public MachineFunctionPass {
33+
public:
34+
static char ID;
35+
36+
AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
37+
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
38+
}
39+
40+
bool runOnMachineFunction(MachineFunction &MF) override;
41+
42+
StringRef getPassName() const override {
43+
return "AMDGPU Reserve WWM Registers";
44+
}
45+
46+
void getAnalysisUsage(AnalysisUsage &AU) const override {
47+
AU.setPreservesAll();
48+
MachineFunctionPass::getAnalysisUsage(AU);
49+
}
50+
};
51+
52+
} // End anonymous namespace.
53+
54+
INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
55+
"AMDGPU Reserve WWM Registers", false, false)
56+
57+
char AMDGPUReserveWWMRegs::ID = 0;
58+
59+
char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
60+
61+
/// Reserve the register used by every SI_SPILL_S32_TO_VGPR /
/// SI_RESTORE_S32_FROM_VGPR instruction in \p MF, drop the renamable flag on
/// all operands of the reserved WWM registers, and clear the non-WWM
/// regalloc mask.
///
/// \returns true if at least one such spill/restore instruction was found.
bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  bool ReservedAny = false;
  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      // The register to reserve is operand 0 of the spill pseudo and
      // operand 1 of the restore pseudo; every other opcode is skipped.
      unsigned RegOpIdx;
      switch (Inst.getOpcode()) {
      case AMDGPU::SI_SPILL_S32_TO_VGPR:
        RegOpIdx = 0;
        break;
      case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
        RegOpIdx = 1;
        break;
      default:
        continue;
      }

      Register WWMReg = Inst.getOperand(RegOpIdx).getReg();
      assert(WWMReg.isPhysical() &&
             "All WWM registers should have been allocated by now.");

      FuncInfo->reserveWWMRegister(WWMReg);
      ReservedAny = true;
    }
  }

  // Reserved registers must not carry the renamable flag. The wwm-regs are
  // going to be reserved for the vgpr-regalloc pipeline, so reset the flag
  // on every operand that refers to one of them.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register ReservedReg : FuncInfo->getWWMReservedRegs())
    for (MachineOperand &Operand : MRI.reg_operands(ReservedReg))
      Operand.setIsRenamable(false);

  // The NonWWMRegMask set up earlier for wwm-regalloc is no longer needed.
  FuncInfo->clearNonWWMRegAllocMask();

  return ReservedAny;
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
8181
: RegisterRegAllocBase(N, D, C) {}
8282
};
8383

84+
/// Registry entry type for the allocators offered by the -wwm-regalloc
/// command line option.
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *Name, const char *Description,
                      FunctionPassCtor Ctor)
      : RegisterRegAllocBase(Name, Description, Ctor) {}
};
89+
8490
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
8591
const MachineRegisterInfo &MRI,
8692
const Register Reg) {
@@ -95,13 +101,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
95101
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
96102
}
97103

98-
/// -{sgpr|vgpr}-regalloc=... command line option.
104+
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
105+
const MachineRegisterInfo &MRI,
106+
const Register Reg) {
107+
const SIMachineFunctionInfo *MFI =
108+
MRI.getMF().getInfo<SIMachineFunctionInfo>();
109+
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
110+
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
111+
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
112+
}
113+
114+
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
99115
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
100116

101117
/// A dummy default pass factory indicates whether the register allocator is
102118
/// overridden on the command line.
103119
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
104120
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
121+
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
105122

106123
static SGPRRegisterRegAlloc
107124
defaultSGPRRegAlloc("default",
@@ -118,6 +135,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
118135
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
119136
cl::desc("Register allocator to use for VGPRs"));
120137

138+
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
139+
RegisterPassParser<WWMRegisterRegAlloc>>
140+
WWMRegAlloc("wwm-regalloc", cl::Hidden,
141+
cl::init(&useDefaultRegisterAllocator),
142+
cl::desc("Register allocator to use for WWM registers"));
121143

122144
static void initializeDefaultSGPRRegisterAllocatorOnce() {
123145
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -137,6 +159,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
137159
}
138160
}
139161

162+
static void initializeDefaultWWMRegisterAllocatorOnce() {
163+
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
164+
165+
if (!Ctor) {
166+
Ctor = WWMRegAlloc;
167+
WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
168+
}
169+
}
170+
140171
static FunctionPass *createBasicSGPRRegisterAllocator() {
141172
return createBasicRegisterAllocator(onlyAllocateSGPRs);
142173
}
@@ -161,6 +192,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
161192
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
162193
}
163194

195+
/// Basic register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}
198+
199+
/// Greedy register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}
202+
203+
/// Fast register allocator restricted by the onlyAllocateWWMRegs filter.
static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs,
                                     /*ClearVirtRegs=*/false);
}
206+
164207
static SGPRRegisterRegAlloc basicRegAllocSGPR(
165208
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
166209
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -177,7 +220,16 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
177220

178221
static VGPRRegisterRegAlloc fastRegAllocVGPR(
179222
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
180-
}
223+
224+
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
225+
"basic register allocator",
226+
createBasicWWMRegisterAllocator);
227+
static WWMRegisterRegAlloc
228+
greedyRegAllocWWMReg("greedy", "greedy register allocator",
229+
createGreedyWWMRegisterAllocator);
230+
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
231+
createFastWWMRegisterAllocator);
232+
} // anonymous namespace
181233

182234
static cl::opt<bool>
183235
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
@@ -425,6 +477,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
425477
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
426478
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
427479
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
480+
initializeAMDGPUReserveWWMRegsPass(*PR);
428481
initializeAMDGPURewriteOutArgumentsPass(*PR);
429482
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
430483
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -1014,6 +1067,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
10141067

10151068
FunctionPass *createSGPRAllocPass(bool Optimized);
10161069
FunctionPass *createVGPRAllocPass(bool Optimized);
1070+
FunctionPass *createWWMRegAllocPass(bool Optimized);
10171071
FunctionPass *createRegAllocPass(bool Optimized) override;
10181072

10191073
bool addRegAssignAndRewriteFast() override;
@@ -1410,7 +1464,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
14101464
}
14111465

14121466
bool GCNPassConfig::addPreRewrite() {
1413-
addPass(&SILowerWWMCopiesID);
14141467
if (EnableRegReassign)
14151468
addPass(&GCNNSAReassignID);
14161469
return true;
@@ -1446,12 +1499,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
14461499
return createFastVGPRRegisterAllocator();
14471500
}
14481501

1502+
/// Build the allocator pass for the WWM pipeline. An allocator explicitly
/// chosen through the registry default takes precedence; otherwise the
/// greedy allocator is used when \p Optimized is set and the fast allocator
/// when it is not.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Install the global default exactly once.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor SelectedCtor =
      WWMRegisterRegAlloc::getDefault();
  const bool IsOverridden = SelectedCtor != useDefaultRegisterAllocator;
  if (IsOverridden)
    return SelectedCtor();

  return Optimized ? createGreedyWWMRegisterAllocator()
                   : createFastWWMRegisterAllocator();
}
1516+
14491517
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
14501518
llvm_unreachable("should not be used");
14511519
}
14521520

14531521
static const char RegAllocOptNotSupportedMessage[] =
1454-
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1522+
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1523+
"and -vgpr-regalloc";
14551524

14561525
bool GCNPassConfig::addRegAssignAndRewriteFast() {
14571526
if (!usingDefaultRegAlloc())
@@ -1463,11 +1532,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
14631532

14641533
// Equivalent of PEI for SGPRs.
14651534
addPass(&SILowerSGPRSpillsID);
1535+
1536+
// To allocate wwm registers used in whole quad mode operations (for shaders).
14661537
addPass(&SIPreAllocateWWMRegsID);
14671538

1468-
addPass(createVGPRAllocPass(false));
1539+
// For allocating other wwm register operands.
1540+
addPass(createWWMRegAllocPass(false));
14691541

14701542
addPass(&SILowerWWMCopiesID);
1543+
addPass(&AMDGPUReserveWWMRegsID);
1544+
1545+
// For allocating regular VGPRs.
1546+
addPass(createVGPRAllocPass(false));
1547+
14711548
return true;
14721549
}
14731550

@@ -1487,8 +1564,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14871564

14881565
// Equivalent of PEI for SGPRs.
14891566
addPass(&SILowerSGPRSpillsID);
1567+
1568+
// To allocate wwm registers used in whole quad mode operations (for shaders).
14901569
addPass(&SIPreAllocateWWMRegsID);
14911570

1571+
// For allocating other whole wave mode registers.
1572+
addPass(createWWMRegAllocPass(true));
1573+
addPass(&SILowerWWMCopiesID);
1574+
addPass(createVirtRegRewriter(false));
1575+
addPass(&AMDGPUReserveWWMRegsID);
1576+
1577+
// For allocating regular VGPRs.
14921578
addPass(createVGPRAllocPass(true));
14931579

14941580
addPreRewrite();

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
9595
AMDGPURegBankSelect.cpp
9696
AMDGPURegisterBankInfo.cpp
9797
AMDGPURemoveIncompatibleFunctions.cpp
98+
AMDGPUReserveWWMRegs.cpp
9899
AMDGPUResourceUsageAnalysis.cpp
99100
AMDGPURewriteOutArguments.cpp
100101
AMDGPURewriteUndefForPHI.cpp

0 commit comments

Comments
 (0)