Skip to content

Commit 6d8e044

Browse files
committed
Named operand and stable partition applied
(cherry picked from commit ee6d876fcc3d84d6ea3a68b3eee1ce97e714b6e6)
1 parent 9fe1c13 commit 6d8e044

File tree

11 files changed

+749
-1096
lines changed

11 files changed

+749
-1096
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -867,13 +867,13 @@ class MachineRegisterInfo {
867867
return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr;
868868
}
869869

870-
/// setRegAllocationAntiHint - Add a register allocation anti-hint for the
870+
/// addRegAllocAntiHint - Add a register allocation anti-hint for the
871871
/// specified virtual register. This tells the allocator to avoid allocating
872872
/// VReg to the same physical register as AntiHintVReg (or overlapping ones).
873-
void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) {
873+
void addRegAllocAntiHint(Register VReg, Register AntiHintVReg) {
874874
assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
875875
assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
876-
AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs()));
876+
AntiHintRegs.grow(VReg);
877877
SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
878878
// Avoid duplicates
879879
if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end())
@@ -884,7 +884,7 @@ class MachineRegisterInfo {
884884
void addRegAllocationAntiHints(Register VReg,
885885
ArrayRef<Register> AntiHintVRegs) {
886886
for (Register AntiHint : AntiHintVRegs)
887-
setRegAllocationAntiHint(VReg, AntiHint);
887+
addRegAllocAntiHint(VReg, AntiHint);
888888
}
889889

890890
/// clearRegAllocationAntiHints - Clear all anti-hints for a register.
@@ -915,7 +915,7 @@ class MachineRegisterInfo {
915915
/// VRM is the current virtual register map showing allocations made so far.
916916
void getPhysRegAntiHints(Register VReg,
917917
SmallVectorImpl<MCPhysReg> &PhysAntiHints,
918-
const VirtRegMap *VRM) const;
918+
const VirtRegMap &VRM) const;
919919

920920
/// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
921921
/// specified register as undefined which causes the DBG_VALUE to be

llvm/lib/CodeGen/AllocationOrder.cpp

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
4848

4949
// Get anti-hints
5050
SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
51-
MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
51+
MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, VRM);
5252

5353
LLVM_DEBUG({
5454
if (!AntiHintedPhysRegs.empty()) {
@@ -84,29 +84,34 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
8484

8585
void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
8686
const TargetRegisterInfo *TRI) {
87+
// Helper to check if a register overlaps with any anti-hint
88+
auto isAntiHinted = [&](MCPhysReg Reg) {
89+
return std::any_of(
90+
AntiHintedPhysRegs.begin(), AntiHintedPhysRegs.end(),
91+
[&](MCPhysReg AntiHint) { return TRI->regsOverlap(Reg, AntiHint); });
92+
};
93+
8794
// Create filtered order
8895
FilteredOrderStorage.clear();
89-
FilteredOrderStorage.reserve(Order.size());
90-
91-
// Add non-anti-hinted registers first
92-
for (MCPhysReg PhysReg : Order) {
93-
if (!is_contained(AntiHintedPhysRegs, PhysReg)) {
94-
FilteredOrderStorage.push_back(PhysReg);
95-
}
96-
}
96+
FilteredOrderStorage.assign(Order.begin(), Order.end());
9797

98-
// Add anti-hinted registers at the end as last resort
99-
for (MCPhysReg PhysReg : Order) {
100-
if (is_contained(AntiHintedPhysRegs, PhysReg)) {
101-
FilteredOrderStorage.push_back(PhysReg);
102-
}
103-
}
98+
// Partition: non-anti-hinted registers go first; relative order is preserved
99+
auto PartitionPoint = std::stable_partition(
100+
FilteredOrderStorage.begin(), FilteredOrderStorage.end(),
101+
[&](MCPhysReg Reg) { return !isAntiHinted(Reg); });
104102

105103
// Update Order
106104
Order = FilteredOrderStorage;
107105

108106
LLVM_DEBUG({
109-
dbgs() << "moved " << AntiHintedPhysRegs.size()
110-
<< " anti-hinted registers to end of allocation order\n";
107+
size_t NonAntiHintedCount =
108+
std::distance(FilteredOrderStorage.begin(), PartitionPoint);
109+
size_t AntiHintedCount =
110+
std::distance(PartitionPoint, FilteredOrderStorage.end());
111+
dbgs() << " Added " << NonAntiHintedCount
112+
<< " non-anti-hinted registers first\n"
113+
<< " Added " << AntiHintedCount
114+
<< " anti-hinted registers at the end\n"
115+
<< " Anti-hint filtering complete\n";
111116
});
112117
}

llvm/lib/CodeGen/AllocationOrder.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class LiveRegMatrix;
3030

3131
class LLVM_LIBRARY_VISIBILITY AllocationOrder {
3232
const SmallVector<MCPhysReg, 16> Hints;
33+
// Used as storage if the Order received in the constructor needs to be
34+
// altered.
3335
SmallVector<MCPhysReg, 16> FilteredOrderStorage;
3436
ArrayRef<MCPhysReg> Order;
3537
// How far into the Order we can iterate. This is 0 if the AllocationOrder is

llvm/lib/CodeGen/MachineRegisterInfo.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -680,22 +680,19 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
680680

681681
void MachineRegisterInfo::getPhysRegAntiHints(
682682
Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
683-
const VirtRegMap *VRM) const {
683+
const VirtRegMap &VRM) const {
684684
assert(VReg.isVirtual());
685-
if (!AntiHintRegs.inBounds(VReg) || !VRM)
685+
if (!AntiHintRegs.inBounds(VReg))
686686
return;
687687

688688
const SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
689-
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
690689

691690
for (Register AntiHintVReg : AntiHints) {
692691
// Check if the anti-hinted register has been allocated
693-
if (VRM->hasPhys(AntiHintVReg)) {
694-
MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg);
695-
// Add the physical register and all its aliases
696-
for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
697-
PhysAntiHints.push_back(*AI);
698-
}
692+
if (VRM.hasPhys(AntiHintVReg)) {
693+
MCPhysReg PhysReg = VRM.getPhys(AntiHintVReg);
694+
// Add only the physical register itself; AllocationOrder::applyAntiHints
// checks overlaps via TRI->regsOverlap
695+
PhysAntiHints.push_back(PhysReg);
699696
}
700697
}
701698

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "SIRegisterInfo.h"
3838
#include "llvm/CodeGen/LiveIntervals.h"
3939
#include "llvm/CodeGen/MachineFunctionPass.h"
40+
#include "llvm/CodeGen/Register.h"
4041
#include "llvm/InitializePasses.h"
4142

4243
using namespace llvm;
@@ -253,37 +254,45 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253254
TRI = ST.getRegisterInfo();
254255

255256
bool Changed = false;
256-
257-
// Single pass implementation
257+
// Add RA anti-hints to reduce MFMA hazard NOPs
258258
if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) {
259259
// Max number of recent MFMAs kept in the lookback window for RAW or WAW hazard
260260
constexpr unsigned MaxLookbackWindow = 19;
261261
for (const MachineBasicBlock &MBB : MF) {
262-
263-
SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
264-
RecentMFMAs;
262+
SmallVector<SmallVector<Register, 4>, 16> RecentMFMAs;
265263
for (const MachineInstr &MI : MBB) {
266264
if (MI.isDebugInstr())
267265
continue;
268-
const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
266+
269267
// Handle MFMA instructions
270268
if (SIInstrInfo::isMFMA(MI)) {
271269
SmallVector<Register, 4> MFMARegisters;
272-
auto collectMFMARegister = [&](unsigned OpIdx) {
273-
if (OpIdx >= MI.getNumOperands())
270+
// Helper to get named operand
271+
auto collectNamedOperand = [&](AMDGPU::OpName OpName,
272+
const char *OpNameStr) {
273+
const MachineOperand *MO = TII->getNamedOperand(MI, OpName);
274+
if (!MO) {
275+
LLVM_DEBUG(dbgs() << " Named operand " << OpNameStr
276+
<< " not found\n");
274277
return;
275-
276-
const MachineOperand &MO = MI.getOperand(OpIdx);
277-
if (MO.isReg() && MO.getReg().isVirtual())
278-
MFMARegisters.push_back(MO.getReg());
278+
}
279+
if (MO->isReg() && MO->getReg().isVirtual()) {
280+
Register Reg = MO->getReg();
281+
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
282+
// Only consider VGPRs
283+
if (TRI->hasVGPRs(RC))
284+
MFMARegisters.push_back(Reg);
285+
LLVM_DEBUG(dbgs() << " Collected " << OpNameStr << " : "
286+
<< printReg(Reg, TRI) << "\n");
287+
}
279288
};
280-
// Only collect Matrix C (operand 3) and destination (operand 0)
281-
// registers
282-
collectMFMARegister(0);
283-
collectMFMARegister(3);
284289

290+
// Collect destination and source C registers
291+
collectNamedOperand(AMDGPU::OpName::vdst, "vdst"); // Destination
292+
collectNamedOperand(AMDGPU::OpName::src2,
293+
"src2"); // Matrix C (accumulator)
285294
if (!MFMARegisters.empty()) {
286-
RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
295+
RecentMFMAs.emplace_back(std::move(MFMARegisters));
287296
// Maintain window
288297
if (RecentMFMAs.size() > MaxLookbackWindow)
289298
RecentMFMAs.erase(RecentMFMAs.begin());
@@ -309,17 +318,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
309318
// Only process VGPR registers
310319
if (!TRI->isVGPRClass(CandidateRC))
311320
continue;
312-
313321
for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
314-
const SmallVector<Register, 4> &MFMARegs = It->second;
322+
const SmallVector<Register, 4> &MFMARegs = *It;
315323
for (Register MFMAReg : MFMARegs) {
316-
// Verify register class compatibility
317-
const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
318-
if (!TRI->hasVGPRs(MFMARC))
319-
continue;
320-
321324
// Check if MFMA register is dead at current instruction
322325
const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
326+
const SlotIndex CurrentSlot =
327+
LIS->getInstructionIndex(MI).getRegSlot();
323328
if (!MFMAInterval.liveAt(CurrentSlot)) {
324329
// Add bi-directional anti-hints
325330
MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg);

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -731,10 +731,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
731731
; GFX90A-VGPR-NEXT: s_nop 1
732732
; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0
733733
; GFX90A-VGPR-NEXT: s_nop 3
734-
; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
735-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
734+
; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
735+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 0
736736
; GFX90A-VGPR-NEXT: s_nop 7
737-
; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
737+
; GFX90A-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
738738
; GFX90A-VGPR-NEXT: s_endpgm
739739
;
740740
; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
@@ -747,10 +747,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
747747
; GFX942-VGPR-NEXT: s_nop 1
748748
; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0
749749
; GFX942-VGPR-NEXT: s_nop 3
750-
; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
751-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
750+
; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
751+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
752752
; GFX942-VGPR-NEXT: s_nop 7
753-
; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
753+
; GFX942-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
754754
; GFX942-VGPR-NEXT: s_endpgm
755755
bb:
756756
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
@@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
16291629
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
16301630
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
16311631
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
1632-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
1633-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
1632+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2
1633+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3
1634+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
1635+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
16341636
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
16351637
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
16361638
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
1637-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
1638-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
1639-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
1640-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
1641-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
1642-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1643-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1639+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
1640+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
1641+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
1642+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
1643+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
16441644
; GFX90A-VGPR-NEXT: s_nop 1
1645-
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
1645+
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
16461646
; GFX90A-VGPR-NEXT: s_nop 15
16471647
; GFX90A-VGPR-NEXT: s_nop 1
16481648
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
16571657
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
16581658
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
16591659
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
1660-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
1661-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
1660+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2
1661+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3
1662+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
1663+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
16621664
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
16631665
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
16641666
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
1665-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
1666-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
1667-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
1668-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
1669-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
1670-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
1671-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
1667+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7]
1668+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7]
1669+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5]
1670+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
1671+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
16721672
; GFX942-VGPR-NEXT: s_nop 1
1673-
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
1673+
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
16741674
; GFX942-VGPR-NEXT: s_nop 15
16751675
; GFX942-VGPR-NEXT: s_nop 1
16761676
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
17431743
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
17441744
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
17451745
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
1746-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
1747-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
1746+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2
1747+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3
1748+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
1749+
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
17481750
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
17491751
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
17501752
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
1751-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
1752-
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
1753-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
1754-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
1755-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
1756-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1757-
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1753+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
1754+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
1755+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
1756+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
1757+
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
17581758
; GFX90A-VGPR-NEXT: s_nop 1
1759-
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
1759+
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
17601760
; GFX90A-VGPR-NEXT: s_nop 15
17611761
; GFX90A-VGPR-NEXT: s_nop 1
17621762
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
17711771
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
17721772
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
17731773
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
1774-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
1775-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
1774+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2
1775+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3
1776+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
1777+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
17761778
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
17771779
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
17781780
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
1779-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
1780-
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
1781-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
1782-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
1783-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
1784-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
1785-
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
1781+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7]
1782+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7]
1783+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5]
1784+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
1785+
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
17861786
; GFX942-VGPR-NEXT: s_nop 1
1787-
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
1787+
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
17881788
; GFX942-VGPR-NEXT: s_nop 15
17891789
; GFX942-VGPR-NEXT: s_nop 1
17901790
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16

0 commit comments

Comments
 (0)