-
Notifications
You must be signed in to change notification settings - Fork 15.4k
AMDGPU: Handle rewriting VGPR MFMA to AGPR with subregister copies #153019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Handle rewriting VGPR MFMA to AGPR with subregister copies #153019
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThis should address the case where the result isn't fully used, Patch is 56.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153019.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8dfdbe5dd494..5206f32ec99e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,8 +14,6 @@
/// MFMA opcode.
///
/// TODO:
-/// - Handle SplitKit partial copy bundles, and not just full copy instructions
-///
/// - Update LiveIntervals incrementally instead of recomputing from scratch
///
//===----------------------------------------------------------------------===//
@@ -37,6 +35,7 @@ using namespace llvm;
namespace {
class AMDGPURewriteAGPRCopyMFMAImpl {
+ MachineFunction &MF;
const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
@@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
const RegisterClassInfo &RegClassInfo)
- : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS), RegClassInfo(RegClassInfo) {}
@@ -71,26 +70,26 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
///
/// \p RewriteRegs will accumulate the set of register used by those MFMAs
/// that need to have the register classes adjusted.
- const TargetRegisterClass *recomputeRegClassExceptRewritable(
- Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+ bool recomputeRegClassExceptRewritable(
+ Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const;
bool run(MachineFunction &MF) const;
};
-const TargetRegisterClass *
-AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
- Register StartReg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
+ Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const {
SmallVector<Register, 8> Worklist = {StartReg};
// Recursively visit all transitive MFMA users
while (!Worklist.empty()) {
Register Reg = Worklist.pop_back_val();
+ const TargetRegisterClass *OldRC = MRI.getRegClass(Reg);
+
+ // Inflate to the equivalent AV_* class.
+ const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF);
+
// Accumulate constraints from all uses.
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
// Apply the effect of the given operand to NewRC.
@@ -101,23 +100,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
// either AGPR or VGPR in src0/src1, so don't bother checking the
// constraint effects of the individual operands.
if (isRewriteCandidate(*MI)) {
- for (AMDGPU::OpName OpName :
- {AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) {
- const MachineOperand *Op = TII.getNamedOperand(*MI, OpName);
+ const MachineOperand *VDst =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ const MachineOperand *Src2 =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
+ for (const MachineOperand *Op : {VDst, Src2}) {
if (!Op->isReg())
continue;
Register OtherReg = Op->getReg();
- if (OtherReg != Reg) {
- if (RewriteRegs.insert(OtherReg))
- Worklist.push_back(OtherReg);
- }
+ if (OtherReg.isPhysical())
+ return false;
+
+ if (OtherReg != Reg && RewriteRegs.insert(OtherReg))
+ Worklist.push_back(OtherReg);
}
- LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI);
+ if (!is_contained(RewriteCandidates, MI)) {
+ LLVM_DEBUG({
+ Register VDstPhysReg = VRM.getPhys(VDst->getReg());
+ dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
+ << " Dst=[" << printReg(VDst->getReg()) << " => "
+ << printReg(VDstPhysReg, &TRI);
+
+ if (Src2->isReg()) {
+ Register Src2PhysReg = VRM.getPhys(Src2->getReg());
+ dbgs() << ", Src2=[" << printReg(Src2->getReg(), &TRI) << " => "
+ << printReg(Src2PhysReg, &TRI);
+ }
+
+ dbgs() << "]: " << MI;
+ });
- if (!is_contained(RewriteCandidates, MI))
RewriteCandidates.push_back(MI);
+ }
continue;
}
@@ -126,13 +142,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI);
if (!NewRC || NewRC == OldRC) {
LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI)
- << " cannot be reassigned to AGPR: " << *MI);
- return nullptr;
+ << " cannot be reassigned to "
+ << TRI.getRegClassName(NewRC) << ": " << *MI);
+ return false;
}
}
}
- return NewRC;
+ return true;
}
/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
@@ -225,10 +242,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
for (VNInfo *VNI : LI.vnis()) {
MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
-
- // TODO: Handle SplitKit produced copy bundles for partially defined
- // registers.
- if (!DefMI || !DefMI->isFullCopy())
+ if (!DefMI || !DefMI->isCopy())
continue;
Register MFMADstReg = DefMI->getOperand(1).getReg();
@@ -241,34 +255,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
if (!MFMA || !isRewriteCandidate(*MFMA))
continue;
- MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
- Register Src2Reg;
- if (Src2->isReg()) {
- Src2Reg = Src2->getReg();
- if (!Src2Reg.isVirtual())
- continue;
- }
-
- // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
- // of an AGPR or VGPR subclass, so we can't simply use the result on the
- // assignment.
-
- LLVM_DEBUG({
- dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
- << " Dst=[" << printReg(VReg) << " => "
- << printReg(PhysReg, &TRI);
-
- if (Src2Reg) {
- Register Src2PhysReg = VRM.getPhys(Src2Reg);
- dbgs() << ", Src2=[" << printReg(Src2Reg, &TRI) << " => "
- << printReg(Src2PhysReg, &TRI);
- }
-
- dbgs() << "]: " << *MFMA;
- });
-
- const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
-
// src2 and dst have the same physical class constraint; try to preserve
// the original src2 subclass if one were to exist.
SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
@@ -287,11 +273,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
//
// Note recomputeRegClassExceptRewritable will consider the constraints of
// this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
- const TargetRegisterClass *DstExceptRC =
- recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC,
- RewriteCandidates, RewriteRegs);
- if (!DstExceptRC) {
- LLVM_DEBUG(dbgs() << "Could not recompute the regclass of "
+ if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
+ RewriteRegs)) {
+ LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
<< printReg(MFMADstReg, &TRI) << '\n');
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index 3103d635200c6..45c185b6154ea 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -20,10 +20,6 @@
ret void
}
- define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
- ret void
- }
-
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
ret void
}
@@ -420,93 +416,6 @@ body: |
...
-# Non-mac variant, src2 is the same VGPR, but a different subregister.
----
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
-tracksRegLiveness: true
-machineFunctionInfo:
- isEntryFunction: true
- stackPtrOffsetReg: '$sgpr32'
- occupancy: 10
- sgprForEXECCopy: '$sgpr100_sgpr101'
-body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
- ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
- ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
- ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
- ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- bb.0:
- S_NOP 0, implicit-def $agpr0
- renamable $sgpr0 = S_MOV_B32 0
- undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
- renamable $sgpr1 = COPY renamable $sgpr0
- %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
- renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- %0.sub9:vreg_1024_align2 = COPY %0.sub8
-
- bb.1:
- liveins: $vcc
-
- undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
- S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- S_BRANCH %bb.2
-
- bb.2:
- ; No VGPRs available for %0
- S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- S_ENDPGM 0
-
-...
-
# There isn't an assignable AGPR around the first MFMA.
---
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 3de86da766af7..735c7e081b21a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -1116,11 +1116,8 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $agpr0_agpr1
; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1202,10 +1199,8 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $agpr0_agpr1
- ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1957,3 +1952,89 @@ body: |
S_ENDPGM 0
...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0,...
[truncated]
|
| ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 | ||
| ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5 | ||
| ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1 | ||
| ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How did we end up changing V_MFMA_F32_32X32X8F16_vgprcd_e64 ti V_MFMA_F32_32X32X8F16_e64?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This example is the whole point of doing this. Since the MFMA result isn't fully used, SplitKit only inserted a copy bundle of only the lanes that needed to be live, and not a full copy. This now treats the partial copies as starting points for the rewrite
3956361 to
603f8df
Compare
3776f12 to
69cd540
Compare
603f8df to
d4de0b2
Compare
|
ping |
perlfu
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, with one minor question in comments
69cd540 to
dd011f2
Compare
d4de0b2 to
2ad172e
Compare
dd011f2 to
8b45c13
Compare
2ad172e to
7619736
Compare
This should address the case where the result isn't fully used, resulting in partial copy bundles from the MFMA result.
7619736 to
92a0a65
Compare

This should address the case where the result isn't fully used,
resulting in partial copy bundles from the MFMA result.