Skip to content

Commit 8f3e78f

Browse files
authored
AMDGPU: Add pass to replace constant materialize with AV pseudos (#149292)
If we have a v_mov_b32 or v_accvgpr_write_b32 with an inline immediate, replace it with a pseudo which writes to the combined AV_* class. This relaxes the operand constraints, which will allow the allocator to inflate the register class to AV_* to potentially avoid spilling. The allocator does not know how to replace an instruction to enable the change of register class. I originally tried to do this by changing all of the places we introduce v_mov_b32 with immediate, but it's a long tail of niche cases that require manual updating. Plus we can restrict this to only run on functions where we know we will be allocating AGPRs.
1 parent a961210 commit 8f3e78f

19 files changed

+330
-83
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ struct AMDGPULowerBufferFatPointersPass
153153
const TargetMachine &TM;
154154
};
155155

156+
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
157+
extern char &AMDGPUPrepareAGPRAllocLegacyID;
158+
156159
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
157160
extern char &AMDGPUReserveWWMRegsLegacyID;
158161

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse
114114
MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
115115
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
116116
MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
117+
MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
117118
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
118119
MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
119120
MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Make simple transformations to relax register constraints for cases which can
10+
// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into
11+
// AGPR or VGPR with a pseudo with an AV_* class register constraint. This
12+
// allows later passes to inflate the register class if necessary. The register
13+
// allocator does not know to replace instructions to relax constraints.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "AMDGPUPrepareAGPRAlloc.h"
18+
#include "AMDGPU.h"
19+
#include "GCNSubtarget.h"
20+
#include "SIMachineFunctionInfo.h"
21+
#include "SIRegisterInfo.h"
22+
#include "llvm/CodeGen/LiveIntervals.h"
23+
#include "llvm/CodeGen/MachineFunctionPass.h"
24+
#include "llvm/InitializePasses.h"
25+
26+
using namespace llvm;
27+
28+
#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc"
29+
30+
namespace {
31+
32+
// Pass-manager-independent implementation of the transformation. Both the
// legacy pass and the new pass manager pass construct a temporary instance of
// this class and call run() on it.
class AMDGPUPrepareAGPRAllocImpl {
33+
private:
34+
// Instruction info taken from the subtarget; used to look up the AV pseudo
// descriptor and to test for inline constants.
const SIInstrInfo &TII;
35+
// Register info of the function being processed; queried for whether AGPR0 is
// reserved.
MachineRegisterInfo &MRI;
36+
37+
public:
38+
AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
39+
: TII(*ST.getInstrInfo()), MRI(MRI) {}
40+
// Returns true if at least one instruction in MF was rewritten.
bool run(MachineFunction &MF);
41+
};
42+
43+
// Legacy pass manager wrapper around AMDGPUPrepareAGPRAllocImpl.
class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass {
44+
public:
45+
static char ID;
46+
47+
// Registers the pass with the global PassRegistry on construction.
AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {
48+
initializeAMDGPUPrepareAGPRAllocLegacyPass(
49+
*PassRegistry::getPassRegistry());
50+
}
51+
52+
bool runOnMachineFunction(MachineFunction &MF) override;
53+
54+
StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; }
55+
56+
// The pass only swaps instruction descriptors in place, so it declares every
// analysis as preserved.
void getAnalysisUsage(AnalysisUsage &AU) const override {
57+
AU.setPreservesAll();
58+
MachineFunctionPass::getAnalysisUsage(AU);
59+
}
60+
};
61+
} // End anonymous namespace.
62+
63+
INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
64+
"AMDGPU Prepare AGPR Alloc", false, false)
65+
INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
66+
"AMDGPU Prepare AGPR Alloc", false, false)
67+
68+
char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
69+
70+
char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID;
71+
72+
// Legacy pass manager entry point. Does nothing when skipFunction() says the
// function should be skipped; otherwise forwards to the shared implementation
// and returns whether it changed anything.
bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) {
73+
if (skipFunction(MF.getFunction()))
74+
return false;
75+
76+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
77+
return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
78+
}
79+
80+
// New pass manager entry point. Runs the shared implementation on MF; MFAM is
// not consulted.
PreservedAnalyses
81+
AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF,
82+
MachineFunctionAnalysisManager &MFAM) {
83+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
84+
// The "changed" result is intentionally discarded: all analyses are reported
// as preserved either way, matching setPreservesAll() in the legacy pass.
AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
85+
return PreservedAnalyses::all();
86+
}
87+
88+
// Walk every instruction in MF and retarget eligible immediate moves to
// AV_MOV_B32_IMM_PSEUDO. Only the instruction descriptor is replaced; the
// operands are left untouched. Returns true if any instruction was rewritten.
bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
89+
// Bail out when AGPR0 is reserved.
// NOTE(review): presumably a reserved AGPR0 means no AGPRs are allocatable in
// this function, so relaxing to an AV class would gain nothing - confirm.
if (MRI.isReserved(AMDGPU::AGPR0))
90+
return false;
91+
92+
const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
93+
94+
bool Changed = false;
95+
for (MachineBasicBlock &MBB : MF) {
96+
for (MachineInstr &MI : MBB) {
97+
// Eligible forms:
//  * V_MOV_B32_e32 whose source (operand 1) is an inline constant;
//  * V_ACCVGPR_WRITE_B32_e64 whose source (operand 1) is any immediate.
if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
98+
TII.isInlineConstant(MI, 1)) ||
99+
(MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
100+
MI.getOperand(1).isImm())) {
101+
MI.setDesc(AVImmPseudo);
102+
Changed = true;
103+
}
104+
}
105+
}
106+
107+
return Changed;
108+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
//===- AMDGPUPrepareAGPRAlloc.h --------------------------------- -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
10+
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
11+
12+
#include "llvm/CodeGen/MachinePassManager.h"
13+
14+
namespace llvm {
15+
// New pass manager machine-function pass, registered in
// AMDGPUPassRegistry.def under the name "amdgpu-prepare-agpr-alloc".
class AMDGPUPrepareAGPRAllocPass
16+
: public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> {
17+
public:
18+
PreservedAnalyses run(MachineFunction &MF,
19+
MachineFunctionAnalysisManager &MFAM);
20+
};
21+
} // namespace llvm
22+
23+
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "AMDGPUMacroFusion.h"
2626
#include "AMDGPUPerfHintAnalysis.h"
2727
#include "AMDGPUPreloadKernArgProlog.h"
28+
#include "AMDGPUPrepareAGPRAlloc.h"
2829
#include "AMDGPURemoveIncompatibleFunctions.h"
2930
#include "AMDGPUReserveWWMRegs.h"
3031
#include "AMDGPUResourceUsageAnalysis.h"
@@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
499500
initializeGlobalISel(*PR);
500501
initializeAMDGPUAsmPrinterPass(*PR);
501502
initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
503+
initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
502504
initializeGCNDPPCombineLegacyPass(*PR);
503505
initializeSILowerI1CopiesLegacyPass(*PR);
504506
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
@@ -1196,6 +1198,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
11961198
bool addRegBankSelect() override;
11971199
void addPreGlobalInstructionSelect() override;
11981200
bool addGlobalInstructionSelect() override;
1201+
void addPreRegAlloc() override;
11991202
void addFastRegAlloc() override;
12001203
void addOptimizedRegAlloc() override;
12011204

@@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() {
15391542
TargetPassConfig::addFastRegAlloc();
15401543
}
15411544

1545+
// Legacy pipeline hook: schedule the AGPR preparation pass, but only when
// optimizations are enabled (it is skipped at CodeGenOptLevel::None).
void GCNPassConfig::addPreRegAlloc() {
1546+
if (getOptLevel() != CodeGenOptLevel::None)
1547+
addPass(&AMDGPUPrepareAGPRAllocLegacyID);
1548+
}
1549+
15421550
void GCNPassConfig::addOptimizedRegAlloc() {
15431551
if (EnableDCEInRA)
15441552
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
22352243
Base::addOptimizedRegAlloc(addPass);
22362244
}
22372245

2246+
// New pass manager counterpart of GCNPassConfig::addPreRegAlloc: adds the
// AGPR preparation pass unless compiling at CodeGenOptLevel::None.
void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const {
2247+
if (getOptLevel() != CodeGenOptLevel::None)
2248+
addPass(AMDGPUPrepareAGPRAllocPass());
2249+
}
2250+
22382251
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
22392252
AddMachinePass &addPass) const {
22402253
// TODO: Check --regalloc-npm option

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ class AMDGPUCodeGenPassBuilder
181181
void addMachineSSAOptimization(AddMachinePass &) const;
182182
void addPostRegAlloc(AddMachinePass &) const;
183183
void addPreEmitPass(AddMachinePass &) const;
184+
void addPreEmitRegAlloc(AddMachinePass &) const;
184185
Error addRegAssignmentOptimized(AddMachinePass &) const;
186+
void addPreRegAlloc(AddMachinePass &) const;
185187
void addOptimizedRegAlloc(AddMachinePass &) const;
186188
void addPreSched2(AddMachinePass &) const;
187189

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
7474
AMDGPULowerKernelArguments.cpp
7575
AMDGPULowerKernelAttributes.cpp
7676
AMDGPULowerModuleLDSPass.cpp
77+
AMDGPUPrepareAGPRAlloc.cpp
7778
AMDGPUSwLowerLDS.cpp
7879
AMDGPUMachineFunction.cpp
7980
AMDGPUMachineModuleInfo.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1113,7 +1113,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11131113
// that will not require an additional 4-bytes; this function assumes that it
11141114
// will.
11151115
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const {
1116-
assert(!MO.isReg() && "isInlineConstant called on register operand!");
11171116
if (!MO.isImm())
11181117
return false;
11191118
return isInlineConstant(MO.getImm(), OperandType);

llvm/test/CodeGen/AMDGPU/agpr-remat.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@
66
define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 {
77
; GFX908-LABEL: remat_constant_voids_spill:
88
; GFX908: ; %bb.0:
9-
; GFX908-NEXT: v_accvgpr_write_b32 a1, 1
10-
; GFX908-NEXT: v_accvgpr_write_b32 a5, 6
11-
; GFX908-NEXT: v_accvgpr_write_b32 a6, 7
12-
; GFX908-NEXT: v_accvgpr_write_b32 a7, 8
13-
; GFX908-NEXT: v_accvgpr_write_b32 a0, 9
14-
; GFX908-NEXT: v_accvgpr_write_b32 a2, 2
15-
; GFX908-NEXT: v_accvgpr_write_b32 a3, 3
16-
; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
9+
; GFX908-NEXT: v_accvgpr_write_b32 a0, 1
10+
; GFX908-NEXT: v_accvgpr_write_b32 a1, 2
11+
; GFX908-NEXT: v_accvgpr_write_b32 a2, 3
12+
; GFX908-NEXT: v_accvgpr_write_b32 a3, 4
1713
; GFX908-NEXT: ;;#ASMSTART
1814
; GFX908-NEXT: ;;#ASMEND
19-
; GFX908-NEXT: v_accvgpr_write_b32 a1, 5
15+
; GFX908-NEXT: v_accvgpr_write_b32 a0, 5
16+
; GFX908-NEXT: v_accvgpr_write_b32 a1, 6
17+
; GFX908-NEXT: v_accvgpr_write_b32 a2, 7
18+
; GFX908-NEXT: v_accvgpr_write_b32 a3, 8
19+
; GFX908-NEXT: v_accvgpr_write_b32 a4, 9
2020
; GFX908-NEXT: ;;#ASMSTART
2121
; GFX908-NEXT: ;;#ASMEND
2222
; GFX908-NEXT: s_endpgm
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX90A %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX908 %s
4+
# RUN: llc -mtriple=amdgcn -mcpu=gfx906 -passes=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefix=NO-AGPR %s
5+
6+
--- |
7+
define void @func() {
8+
ret void
9+
}
10+
11+
; Attribute is honored on gfx90a but ignored on gfx908
12+
define void @no_agprs() "amdgpu-agpr-alloc"="0,0" {
13+
ret void
14+
}
15+
16+
...
17+
---
18+
name: func
19+
tracksRegLiveness: true
20+
stack:
21+
- { id: 0, size: 4 }
22+
body: |
23+
; HAS-AGPR-LABEL: name: func
24+
; HAS-AGPR: bb.0:
25+
; HAS-AGPR-NEXT: successors: %bb.1(0x80000000)
26+
; HAS-AGPR-NEXT: liveins: $vgpr0
27+
; HAS-AGPR-NEXT: {{ $}}
28+
; HAS-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
29+
; HAS-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
30+
; HAS-AGPR-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
31+
; HAS-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
32+
; HAS-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
33+
; HAS-AGPR-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
34+
; HAS-AGPR-NEXT: [[AV_MOV_2:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 6, implicit $exec
35+
; HAS-AGPR-NEXT: {{ $}}
36+
; HAS-AGPR-NEXT: bb.1:
37+
; HAS-AGPR-NEXT: [[AV_MOV_3:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 3, implicit $exec
38+
;
39+
; NO-AGPR-LABEL: name: func
40+
; NO-AGPR: bb.0:
41+
; NO-AGPR-NEXT: successors: %bb.1(0x80000000)
42+
; NO-AGPR-NEXT: liveins: $vgpr0
43+
; NO-AGPR-NEXT: {{ $}}
44+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
45+
; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
46+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
47+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
48+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
49+
; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
50+
; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec
51+
; NO-AGPR-NEXT: {{ $}}
52+
; NO-AGPR-NEXT: bb.1:
53+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
54+
bb.0:
55+
liveins: $vgpr0
56+
%0:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
57+
%1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
58+
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
59+
%3:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
60+
%4:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
61+
%5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
62+
%6:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec
63+
64+
bb.1:
65+
%7:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
66+
67+
...
68+
69+
---
70+
name: no_agprs
71+
tracksRegLiveness: true
72+
body: |
73+
bb.0:
74+
liveins: $vgpr0
75+
; GFX90A-LABEL: name: no_agprs
76+
; GFX90A: liveins: $vgpr0
77+
; GFX90A-NEXT: {{ $}}
78+
; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
79+
; GFX90A-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
80+
;
81+
; GFX908-LABEL: name: no_agprs
82+
; GFX908: liveins: $vgpr0
83+
; GFX908-NEXT: {{ $}}
84+
; GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
85+
; GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
86+
;
87+
; NO-AGPR-LABEL: name: no_agprs
88+
; NO-AGPR: liveins: $vgpr0
89+
; NO-AGPR-NEXT: {{ $}}
90+
; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
91+
; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
92+
%0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
93+
%1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
94+
95+
...

0 commit comments

Comments
 (0)