Skip to content

Commit 9f7b0be

Browse files
committed
AMDGPU: Handle rewriting VGPR MFMAs with immediate src2
This can follow the simpler tied operand handling path.
1 parent bcdb0d7 commit 9f7b0be

File tree

3 files changed

+96
-99
lines changed

llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -206,27 +206,27 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
206206
continue;
207207

208208
MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
209-
if (!Src2->isReg())
210-
continue;
211-
212-
Register Src2Reg = Src2->getReg();
213-
if (!Src2Reg.isVirtual())
214-
continue;
209+
Register Src2Reg;
210+
if (Src2->isReg()) {
211+
Src2Reg = Src2->getReg();
212+
if (!Src2Reg.isVirtual())
213+
continue;
214+
}
215215

216216
// FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
217217
// of an AGPR or VGPR subclass, so we can't simply use the result on the
218218
// assignment.
219219

220220
LLVM_DEBUG({
221-
Register Src2PhysReg = VRM.getPhys(Src2->getReg());
221+
Register Src2PhysReg = VRM.getPhys(Src2Reg);
222222
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
223223
<< " Dst=[" << printReg(VReg) << " => "
224224
<< printReg(PhysReg, &TRI) << "], Src2=["
225225
<< printReg(Src2->getReg(), &TRI) << " => "
226226
<< printReg(Src2PhysReg, &TRI) << "]: " << *MFMA;
227227
});
228228

229-
const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(Src2->getReg());
229+
const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
230230
const TargetRegisterClass *NewDstConstraintRC =
231231
TII.getRegClass(TII.get(AGPROp), 0, &TRI, MF);
232232
const TargetRegisterClass *NewSrc2ConstraintRC = NewDstConstraintRC;
@@ -236,8 +236,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
236236
MF) &&
237237
"expected src2 and dst to have same class constraint");
238238

239-
const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg);
240-
239+
// src2 and dst have the same physical class constraint; try to preserve
240+
// the original src2 subclass if one were to exist.
241241
SmallVector<MachineInstr *, 4> DstRewriteCandidates;
242242

243243
// We've found av = COPY (MFMA), and need to verify that we can trivially
@@ -267,8 +267,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
267267

268268
// If the inputs are tied and the same register, we can shortcut and
269269
// directly replace the register.
270-
if (Src2->getReg() != MFMADstReg ||
271-
Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
270+
if (Src2Reg && (Src2Reg != MFMADstReg ||
271+
Src2->getSubReg() != DefMI->getOperand(1).getSubReg())) {
272+
const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg);
273+
272274
// If src2 and dst are different registers, we need to also reassign the
273275
// input to an available AGPR if it is compatible with all other uses.
274276
//

llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir

Lines changed: 0 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@
1616
ret void
1717
}
1818

19-
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2() #0 {
20-
ret void
21-
}
22-
2319
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2() #0 {
2420
ret void
2521
}
@@ -345,89 +341,6 @@ body: |
345341
346342
...
347343

348-
# Non-mac variant, src2 is an immediate.
349-
---
350-
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
351-
tracksRegLiveness: true
352-
machineFunctionInfo:
353-
isEntryFunction: true
354-
stackPtrOffsetReg: '$sgpr32'
355-
occupancy: 10
356-
sgprForEXECCopy: '$sgpr100_sgpr101'
357-
body: |
358-
; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
359-
; CHECK: bb.0:
360-
; CHECK-NEXT: successors: %bb.1(0x80000000)
361-
; CHECK-NEXT: {{ $}}
362-
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
363-
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
364-
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
365-
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
366-
; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
367-
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
368-
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
369-
; CHECK-NEXT: {{ $}}
370-
; CHECK-NEXT: bb.1:
371-
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
372-
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
373-
; CHECK-NEXT: {{ $}}
374-
; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
375-
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
376-
; CHECK-NEXT: S_BRANCH %bb.2
377-
; CHECK-NEXT: {{ $}}
378-
; CHECK-NEXT: bb.2:
379-
; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
380-
; CHECK-NEXT: {{ $}}
381-
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
382-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
383-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
384-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
385-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
386-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
387-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
388-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
389-
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
390-
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
391-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
392-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
393-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
394-
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
395-
; CHECK-NEXT: S_ENDPGM 0
396-
bb.0:
397-
S_NOP 0, implicit-def $agpr0
398-
renamable $sgpr0 = S_MOV_B32 0
399-
undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
400-
renamable $sgpr1 = COPY renamable $sgpr0
401-
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
402-
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
403-
%0.sub9:vreg_512_align2 = COPY %0.sub8
404-
405-
bb.1:
406-
liveins: $vcc
407-
408-
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
409-
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
410-
S_BRANCH %bb.2
411-
412-
bb.2:
413-
; No VGPRs available for %0
414-
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
415-
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
416-
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
417-
S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
418-
S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
419-
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
420-
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
421-
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
422-
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
423-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
424-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
425-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
426-
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
427-
S_ENDPGM 0
428-
429-
...
430-
431344
# Non-mac variant, src2 is a physical register
432345
---
433346
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2

llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,3 +1183,85 @@ body: |
11831183
S_ENDPGM 0
11841184
11851185
...
1186+
1187+
# Non-mac variant, src2 is an immediate.
1188+
---
1189+
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
1190+
tracksRegLiveness: true
1191+
machineFunctionInfo:
1192+
isEntryFunction: true
1193+
stackPtrOffsetReg: '$sgpr32'
1194+
occupancy: 10
1195+
sgprForEXECCopy: '$sgpr100_sgpr101'
1196+
body: |
1197+
; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
1198+
; CHECK: bb.0:
1199+
; CHECK-NEXT: successors: %bb.1(0x80000000)
1200+
; CHECK-NEXT: {{ $}}
1201+
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
1202+
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
1203+
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
1204+
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
1205+
; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
1206+
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
1207+
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
1208+
; CHECK-NEXT: {{ $}}
1209+
; CHECK-NEXT: bb.1:
1210+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
1211+
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
1212+
; CHECK-NEXT: {{ $}}
1213+
; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
1214+
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
1215+
; CHECK-NEXT: S_BRANCH %bb.2
1216+
; CHECK-NEXT: {{ $}}
1217+
; CHECK-NEXT: bb.2:
1218+
; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
1219+
; CHECK-NEXT: {{ $}}
1220+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1221+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
1222+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
1223+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
1224+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
1225+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
1226+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
1227+
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
1228+
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
1229+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
1230+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
1231+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
1232+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
1233+
; CHECK-NEXT: S_ENDPGM 0
1234+
bb.0:
1235+
S_NOP 0, implicit-def $agpr0
1236+
renamable $sgpr0 = S_MOV_B32 0
1237+
undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
1238+
renamable $sgpr1 = COPY renamable $sgpr0
1239+
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
1240+
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
1241+
%0.sub9:vreg_512_align2 = COPY %0.sub8
1242+
1243+
bb.1:
1244+
liveins: $vcc
1245+
1246+
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
1247+
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
1248+
S_BRANCH %bb.2
1249+
1250+
bb.2:
1251+
; No VGPRs available for %0
1252+
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1253+
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
1254+
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
1255+
S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
1256+
S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
1257+
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
1258+
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
1259+
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
1260+
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
1261+
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
1262+
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
1263+
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
1264+
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
1265+
S_ENDPGM 0
1266+
1267+
...

0 commit comments

Comments
 (0)