Skip to content

Commit 7eac9e4

Browse files
committed
AMDGPU: Handle multiple AGPR MFMA rewrites
Instead of ignoring the same user we started looking at, ignore uses of rewritable MFMA candidates.
1 parent e0eb8f0 commit 7eac9e4

File tree

3 files changed

+297
-21
lines changed

3 files changed

+297
-21
lines changed

llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,27 +57,38 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
5757
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
5858
LIS(LIS) {}
5959

60+
/// Return true if \p MI is an MAI instruction for which
/// AMDGPU::getMFMASrcCVDstAGPROp reports a replacement opcode (i.e. a value
/// other than -1). Such instructions can have their opcode swapped for the
/// AGPR form, so their uses are skipped when recomputing register class
/// constraints.
bool isRewriteCandidate(const MachineInstr &MI) const {
61+
if (!TII.isMAI(MI))
62+
return false;
63+
return AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
64+
}
65+
6066
/// Compute the register class constraints based on the uses of \p Reg,
6167
/// excluding uses by rewritable MFMA candidates. This should be nearly identical to
6268
/// MachineRegisterInfo::recomputeRegClass.
6369
const TargetRegisterClass *
64-
recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC,
65-
const TargetRegisterClass *NewRC,
66-
const MachineInstr *ExceptMI) const;
70+
recomputeRegClassExceptRewritable(Register Reg,
71+
const TargetRegisterClass *OldRC,
72+
const TargetRegisterClass *NewRC) const;
6773

6874
bool run(MachineFunction &MF) const;
6975
};
7076

7177
const TargetRegisterClass *
72-
AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept(
78+
AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
7379
Register Reg, const TargetRegisterClass *OldRC,
74-
const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const {
80+
const TargetRegisterClass *NewRC) const {
7581

7682
// Accumulate constraints from all uses.
7783
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
7884
// Apply the effect of the given operand to NewRC.
7985
MachineInstr *MI = MO.getParent();
80-
if (MI == ExceptMI)
86+
87+
// We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
88+
// effects of rewrite candidates. It just so happens that we can use either
89+
// AGPR or VGPR in src0/src1, so don't bother checking the constraint
90+
// effects of the individual operands.
91+
if (isRewriteCandidate(*MI))
8192
continue;
8293

8394
unsigned OpNo = &MO - &MI->getOperand(0);
@@ -182,10 +193,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
182193
// first place, as well as need to assign another register, and need to
183194
// figure out where to put them. The live range splitting is smarter than
184195
// anything we're doing here, so trust it did something reasonable.
185-
const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept(
186-
Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI);
187-
if (!Src2ExceptRC)
196+
const TargetRegisterClass *Src2ExceptRC =
197+
recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC,
198+
VirtRegRC);
199+
if (!Src2ExceptRC) {
200+
LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n");
188201
continue;
202+
}
189203

190204
const TargetRegisterClass *NewSrc2ConstraintRC =
191205
TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF);
@@ -195,8 +209,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
195209
const TargetRegisterClass *NewSrc2RC =
196210
TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC);
197211
if (!NewSrc2RC) {
198-
// TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA
199-
// using a rewritable MFMA can be rewritten as a pair.
200212
LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI)
201213
<< " are incompatible with replacement class\n");
202214
continue;
@@ -207,8 +219,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
207219

208220
CopySrcMI->setDesc(TII.get(AGPROp));
209221

210-
// TODO: Is replacing too aggressive, fixup these instructions only?
211-
MRI.replaceRegWith(CopySrcReg, VReg);
222+
// Perform replacement of the register, rewriting the rewritable uses.
223+
for (MachineInstr &UseMI :
224+
make_early_inc_range(MRI.reg_instructions(CopySrcReg))) {
225+
if (TII.isMAI(UseMI)) {
226+
// Note the register we need to rewrite may still appear in src0/src1,
227+
// but that's fine since those can use A or V anyway.
228+
int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode());
229+
if (ReplacementOp != -1)
230+
UseMI.setDesc(TII.get(ReplacementOp));
231+
}
232+
233+
UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI);
234+
}
212235

213236
LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI);
214237

llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
ret void
1717
}
1818

19+
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses_cannot_rewrite_final_use() #0 {
20+
ret void
21+
}
22+
1923
attributes #0 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="8,8" }
2024
...
2125

@@ -311,3 +315,81 @@ body: |
311315
$agpr0 = COPY %0
312316
313317
...
318+
319+
---
320+
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses_cannot_rewrite_final_use
321+
tracksRegLiveness: true
322+
machineFunctionInfo:
323+
isEntryFunction: true
324+
stackPtrOffsetReg: '$sgpr32'
325+
occupancy: 10
326+
sgprForEXECCopy: '$sgpr100_sgpr101'
327+
body: |
328+
; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses_cannot_rewrite_final_use
329+
; CHECK: bb.0:
330+
; CHECK-NEXT: successors: %bb.1(0x80000000)
331+
; CHECK-NEXT: {{ $}}
332+
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
333+
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
334+
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
335+
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
336+
; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
337+
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
338+
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
339+
; CHECK-NEXT: {{ $}}
340+
; CHECK-NEXT: bb.1:
341+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
342+
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
343+
; CHECK-NEXT: {{ $}}
344+
; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
345+
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
346+
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
347+
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
348+
; CHECK-NEXT: S_BRANCH %bb.2
349+
; CHECK-NEXT: {{ $}}
350+
; CHECK-NEXT: bb.2:
351+
; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
352+
; CHECK-NEXT: {{ $}}
353+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
354+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
355+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
356+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
357+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
358+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
359+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
360+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
361+
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
362+
; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 27983881 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
363+
; CHECK-NEXT: S_ENDPGM 0
364+
bb.0:
365+
S_NOP 0, implicit-def $agpr0
366+
renamable $sgpr0 = S_MOV_B32 0
367+
undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
368+
renamable $sgpr1 = COPY renamable $sgpr0
369+
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
370+
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
371+
%0.sub9:vreg_512_align2 = COPY %0.sub8
372+
373+
bb.1:
374+
liveins: $vcc
375+
376+
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
377+
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
378+
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
379+
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
380+
S_BRANCH %bb.2
381+
382+
bb.2:
383+
; No VGPRs available for %0
384+
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
385+
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
386+
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
387+
S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
388+
S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
389+
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
390+
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
391+
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
392+
INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 27983881 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
393+
S_ENDPGM 0
394+
395+
...

0 commit comments

Comments
 (0)