Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 47 additions & 61 deletions llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
/// MFMA opcode.
///
/// TODO:
/// - Handle SplitKit partial copy bundles, and not just full copy instructions
///
/// - Update LiveIntervals incrementally instead of recomputing from scratch
///
//===----------------------------------------------------------------------===//
Expand All @@ -37,6 +35,7 @@ using namespace llvm;
namespace {

class AMDGPURewriteAGPRCopyMFMAImpl {
MachineFunction &MF;
const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
Expand All @@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
const RegisterClassInfo &RegClassInfo)
: ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS), RegClassInfo(RegClassInfo) {}

Expand All @@ -71,26 +70,28 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
///
/// \p RewriteRegs will accumulate the set of registers used by those MFMAs
/// that need to have the register classes adjusted.
const TargetRegisterClass *recomputeRegClassExceptRewritable(
Register Reg, const TargetRegisterClass *OldRC,
const TargetRegisterClass *NewRC,
SmallVectorImpl<MachineInstr *> &RewriteCandidates,
bool recomputeRegClassExceptRewritable(
Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const;

bool run(MachineFunction &MF) const;
};

const TargetRegisterClass *
AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
Register StartReg, const TargetRegisterClass *OldRC,
const TargetRegisterClass *NewRC,
SmallVectorImpl<MachineInstr *> &RewriteCandidates,
bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const {
SmallVector<Register, 8> Worklist = {StartReg};

// Recursively visit all transitive MFMA users
while (!Worklist.empty()) {
Register Reg = Worklist.pop_back_val();
const TargetRegisterClass *OldRC = MRI.getRegClass(Reg);

// Inflate to the equivalent AV_* class.
const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF);
if (OldRC == NewRC)
return false;

// Accumulate constraints from all uses.
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
// Apply the effect of the given operand to NewRC.
Expand All @@ -101,23 +102,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
// either AGPR or VGPR in src0/src1, so don't bother checking the
// constraint effects of the individual operands.
if (isRewriteCandidate(*MI)) {
for (AMDGPU::OpName OpName :
{AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) {
const MachineOperand *Op = TII.getNamedOperand(*MI, OpName);
const MachineOperand *VDst =
TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
const MachineOperand *Src2 =
TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
for (const MachineOperand *Op : {VDst, Src2}) {
if (!Op->isReg())
continue;

Register OtherReg = Op->getReg();
if (OtherReg != Reg) {
if (RewriteRegs.insert(OtherReg))
Worklist.push_back(OtherReg);
}
if (OtherReg.isPhysical())
return false;

if (OtherReg != Reg && RewriteRegs.insert(OtherReg))
Worklist.push_back(OtherReg);
}

LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI);
if (!is_contained(RewriteCandidates, MI)) {
LLVM_DEBUG({
Register VDstPhysReg = VRM.getPhys(VDst->getReg());
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
<< " Dst=[" << printReg(VDst->getReg()) << " => "
<< printReg(VDstPhysReg, &TRI);

if (Src2->isReg()) {
Register Src2PhysReg = VRM.getPhys(Src2->getReg());
dbgs() << "], Src2=[" << printReg(Src2->getReg(), &TRI) << " => "
<< printReg(Src2PhysReg, &TRI);
}

dbgs() << "]: " << MI;
});

if (!is_contained(RewriteCandidates, MI))
RewriteCandidates.push_back(MI);
}

continue;
}
Expand All @@ -126,13 +144,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI);
if (!NewRC || NewRC == OldRC) {
LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI)
<< " cannot be reassigned to AGPR: " << *MI);
return nullptr;
<< " cannot be reassigned to "
<< TRI.getRegClassName(NewRC) << ": " << *MI);
return false;
}
}
}

return NewRC;
return true;
}

/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
Expand Down Expand Up @@ -228,10 +247,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
continue;

MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);

// TODO: Handle SplitKit produced copy bundles for partially defined
// registers.
if (!DefMI || !DefMI->isFullCopy())
if (!DefMI || !DefMI->isCopy())
continue;

Register MFMADstReg = DefMI->getOperand(1).getReg();
Expand All @@ -244,34 +260,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
if (!MFMA || !isRewriteCandidate(*MFMA))
continue;

MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
Register Src2Reg;
if (Src2->isReg()) {
Src2Reg = Src2->getReg();
if (!Src2Reg.isVirtual())
continue;
}

// FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
// of an AGPR or VGPR subclass, so we can't simply use the result on the
// assignment.

LLVM_DEBUG({
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
<< " Dst=[" << printReg(VReg) << " => "
<< printReg(PhysReg, &TRI);

if (Src2Reg) {
Register Src2PhysReg = VRM.getPhys(Src2Reg);
dbgs() << "], Src2=[" << printReg(Src2Reg, &TRI) << " => "
<< printReg(Src2PhysReg, &TRI);
}

dbgs() << "]: " << *MFMA;
});

const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);

// src2 and dst have the same physical class constraint; try to preserve
// the original src2 subclass if one were to exist.
SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
Expand All @@ -290,11 +278,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
//
// Note recomputeRegClassExceptRewritable will consider the constraints of
// this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
const TargetRegisterClass *DstExceptRC =
recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC,
RewriteCandidates, RewriteRegs);
if (!DstExceptRC) {
LLVM_DEBUG(dbgs() << "Could not recompute the regclass of "
if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
RewriteRegs)) {
LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
<< printReg(MFMADstReg, &TRI) << '\n');
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@
ret void
}

define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
ret void
}

define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
ret void
}
Expand Down Expand Up @@ -420,93 +416,6 @@ body: |

...

# Non-mac variant, src2 is the same VGPR, but a different subregister.
---
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
stackPtrOffsetReg: '$sgpr32'
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
renamable $sgpr0 = S_MOV_B32 0
undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr1 = COPY renamable $sgpr0
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
%0.sub9:vreg_1024_align2 = COPY %0.sub8

bb.1:
liveins: $vcc

undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2

bb.2:
; No VGPRs available for %0
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
S_ENDPGM 0

...

# There isn't an assignable AGPR around the first MFMA.
---
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
Expand Down
Loading