Skip to content

Commit eefad74

Browse files
authored
AMDGPU: Handle rewriting VGPR MFMA to AGPR with subregister copies (llvm#153019)
This should address the case where the MFMA result isn't fully used, which results in partial copy bundles (subregister copies) from the MFMA result.
1 parent fd28257 commit eefad74

6 files changed

+228
-278
lines changed

llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp

Lines changed: 47 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
/// MFMA opcode.
1515
///
1616
/// TODO:
17-
/// - Handle SplitKit partial copy bundles, and not just full copy instructions
18-
///
1917
/// - Update LiveIntervals incrementally instead of recomputing from scratch
2018
///
2119
//===----------------------------------------------------------------------===//
@@ -37,6 +35,7 @@ using namespace llvm;
3735
namespace {
3836

3937
class AMDGPURewriteAGPRCopyMFMAImpl {
38+
MachineFunction &MF;
4039
const GCNSubtarget &ST;
4140
const SIInstrInfo &TII;
4241
const SIRegisterInfo &TRI;
@@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
5352
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
5453
LiveRegMatrix &LRM, LiveIntervals &LIS,
5554
const RegisterClassInfo &RegClassInfo)
56-
: ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
55+
: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
5756
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
5857
LIS(LIS), RegClassInfo(RegClassInfo) {}
5958

@@ -71,26 +70,28 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
7170
///
7271
/// \p RewriteRegs will accumulate the set of registers used by those MFMAs
7372
/// that need to have the register classes adjusted.
74-
const TargetRegisterClass *recomputeRegClassExceptRewritable(
75-
Register Reg, const TargetRegisterClass *OldRC,
76-
const TargetRegisterClass *NewRC,
77-
SmallVectorImpl<MachineInstr *> &RewriteCandidates,
73+
bool recomputeRegClassExceptRewritable(
74+
Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
7875
SmallSetVector<Register, 4> &RewriteRegs) const;
7976

8077
bool run(MachineFunction &MF) const;
8178
};
8279

83-
const TargetRegisterClass *
84-
AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
85-
Register StartReg, const TargetRegisterClass *OldRC,
86-
const TargetRegisterClass *NewRC,
87-
SmallVectorImpl<MachineInstr *> &RewriteCandidates,
80+
bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
81+
Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
8882
SmallSetVector<Register, 4> &RewriteRegs) const {
8983
SmallVector<Register, 8> Worklist = {StartReg};
9084

9185
// Recursively visit all transitive MFMA users
9286
while (!Worklist.empty()) {
9387
Register Reg = Worklist.pop_back_val();
88+
const TargetRegisterClass *OldRC = MRI.getRegClass(Reg);
89+
90+
// Inflate to the equivalent AV_* class.
91+
const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF);
92+
if (OldRC == NewRC)
93+
return false;
94+
9495
// Accumulate constraints from all uses.
9596
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
9697
// Apply the effect of the given operand to NewRC.
@@ -101,23 +102,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
101102
// either AGPR or VGPR in src0/src1, so don't bother checking the
102103
// constraint effects of the individual operands.
103104
if (isRewriteCandidate(*MI)) {
104-
for (AMDGPU::OpName OpName :
105-
{AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) {
106-
const MachineOperand *Op = TII.getNamedOperand(*MI, OpName);
105+
const MachineOperand *VDst =
106+
TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
107+
const MachineOperand *Src2 =
108+
TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
109+
for (const MachineOperand *Op : {VDst, Src2}) {
107110
if (!Op->isReg())
108111
continue;
109112

110113
Register OtherReg = Op->getReg();
111-
if (OtherReg != Reg) {
112-
if (RewriteRegs.insert(OtherReg))
113-
Worklist.push_back(OtherReg);
114-
}
114+
if (OtherReg.isPhysical())
115+
return false;
116+
117+
if (OtherReg != Reg && RewriteRegs.insert(OtherReg))
118+
Worklist.push_back(OtherReg);
115119
}
116120

117-
LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI);
121+
if (!is_contained(RewriteCandidates, MI)) {
122+
LLVM_DEBUG({
123+
Register VDstPhysReg = VRM.getPhys(VDst->getReg());
124+
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
125+
<< " Dst=[" << printReg(VDst->getReg()) << " => "
126+
<< printReg(VDstPhysReg, &TRI);
127+
128+
if (Src2->isReg()) {
129+
Register Src2PhysReg = VRM.getPhys(Src2->getReg());
130+
dbgs() << "], Src2=[" << printReg(Src2->getReg(), &TRI) << " => "
131+
<< printReg(Src2PhysReg, &TRI);
132+
}
133+
134+
dbgs() << "]: " << MI;
135+
});
118136

119-
if (!is_contained(RewriteCandidates, MI))
120137
RewriteCandidates.push_back(MI);
138+
}
121139

122140
continue;
123141
}
@@ -126,13 +144,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
126144
NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI);
127145
if (!NewRC || NewRC == OldRC) {
128146
LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI)
129-
<< " cannot be reassigned to AGPR: " << *MI);
130-
return nullptr;
147+
<< " cannot be reassigned to "
148+
<< TRI.getRegClassName(NewRC) << ": " << *MI);
149+
return false;
131150
}
132151
}
133152
}
134153

135-
return NewRC;
154+
return true;
136155
}
137156

138157
/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
@@ -228,10 +247,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
228247
continue;
229248

230249
MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
231-
232-
// TODO: Handle SplitKit produced copy bundles for partially defined
233-
// registers.
234-
if (!DefMI || !DefMI->isFullCopy())
250+
if (!DefMI || !DefMI->isCopy())
235251
continue;
236252

237253
Register MFMADstReg = DefMI->getOperand(1).getReg();
@@ -244,34 +260,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
244260
if (!MFMA || !isRewriteCandidate(*MFMA))
245261
continue;
246262

247-
MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
248-
Register Src2Reg;
249-
if (Src2->isReg()) {
250-
Src2Reg = Src2->getReg();
251-
if (!Src2Reg.isVirtual())
252-
continue;
253-
}
254-
255-
// FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
256-
// of an AGPR or VGPR subclass, so we can't simply use the result on the
257-
// assignment.
258-
259-
LLVM_DEBUG({
260-
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
261-
<< " Dst=[" << printReg(VReg) << " => "
262-
<< printReg(PhysReg, &TRI);
263-
264-
if (Src2Reg) {
265-
Register Src2PhysReg = VRM.getPhys(Src2Reg);
266-
dbgs() << "], Src2=[" << printReg(Src2Reg, &TRI) << " => "
267-
<< printReg(Src2PhysReg, &TRI);
268-
}
269-
270-
dbgs() << "]: " << *MFMA;
271-
});
272-
273-
const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
274-
275263
// src2 and dst have the same physical class constraint; try to preserve
276264
// the original src2 subclass if one were to exist.
277265
SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
@@ -290,11 +278,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
290278
//
291279
// Note recomputeRegClassExceptRewritable will consider the constraints of
292280
// this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
293-
const TargetRegisterClass *DstExceptRC =
294-
recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC,
295-
RewriteCandidates, RewriteRegs);
296-
if (!DstExceptRC) {
297-
LLVM_DEBUG(dbgs() << "Could not recompute the regclass of "
281+
if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
282+
RewriteRegs)) {
283+
LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
298284
<< printReg(MFMADstReg, &TRI) << '\n');
299285
continue;
300286
}

llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir

Lines changed: 0 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,6 @@
2020
ret void
2121
}
2222

23-
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
24-
ret void
25-
}
26-
2723
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
2824
ret void
2925
}
@@ -420,93 +416,6 @@ body: |
420416
421417
...
422418

423-
# Non-mac variant, src2 is the same VGPR, but a different subregister.
424-
---
425-
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
426-
tracksRegLiveness: true
427-
machineFunctionInfo:
428-
isEntryFunction: true
429-
stackPtrOffsetReg: '$sgpr32'
430-
occupancy: 10
431-
sgprForEXECCopy: '$sgpr100_sgpr101'
432-
body: |
433-
; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
434-
; CHECK: bb.0:
435-
; CHECK-NEXT: successors: %bb.1(0x80000000)
436-
; CHECK-NEXT: {{ $}}
437-
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
438-
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
439-
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
440-
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
441-
; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
442-
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
443-
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
444-
; CHECK-NEXT: {{ $}}
445-
; CHECK-NEXT: bb.1:
446-
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
447-
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
448-
; CHECK-NEXT: {{ $}}
449-
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
450-
; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
451-
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
452-
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
453-
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
454-
; CHECK-NEXT: S_BRANCH %bb.2
455-
; CHECK-NEXT: {{ $}}
456-
; CHECK-NEXT: bb.2:
457-
; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
458-
; CHECK-NEXT: {{ $}}
459-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
460-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
461-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
462-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
463-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
464-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
465-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
466-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
467-
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
468-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
469-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
470-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
471-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
472-
; CHECK-NEXT: S_ENDPGM 0
473-
bb.0:
474-
S_NOP 0, implicit-def $agpr0
475-
renamable $sgpr0 = S_MOV_B32 0
476-
undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
477-
renamable $sgpr1 = COPY renamable $sgpr0
478-
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
479-
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
480-
%0.sub9:vreg_1024_align2 = COPY %0.sub8
481-
482-
bb.1:
483-
liveins: $vcc
484-
485-
undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
486-
%0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
487-
%0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
488-
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
489-
S_BRANCH %bb.2
490-
491-
bb.2:
492-
; No VGPRs available for %0
493-
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
494-
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
495-
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
496-
S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
497-
S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
498-
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
499-
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
500-
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
501-
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
502-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
503-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
504-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
505-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
506-
S_ENDPGM 0
507-
508-
...
509-
510419
# There isn't an assignable AGPR around the first MFMA.
511420
---
512421
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first

0 commit comments

Comments
 (0)