diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481f84960..b83e62f4ea57c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -541,6 +541,17 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+class AMDGPUEliminateAGPRToVGPRCopyPass
+    : public PassInfoMixin<AMDGPUEliminateAGPRToVGPRCopyPass> {
+public:
+  AMDGPUEliminateAGPRToVGPRCopyPass() = default;
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+void initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(PassRegistry &);
+extern char &AMDGPUEliminateAGPRToVGPRCopyLegacyID;
+
 class AMDGPURewriteAGPRCopyMFMAPass
     : public PassInfoMixin<AMDGPURewriteAGPRCopyMFMAPass> {
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
new file mode 100644
index 0000000000000..5dc1430eaf771
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -0,0 +1,267 @@
+//===-- AMDGPUEliminateAGPRToVGPRCopy.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Eliminate copies out of AGPRs once physical registers have been assigned.
+///
+/// The pass looks for full copies whose source was assigned an AGPR and whose
+/// destination was assigned a VGPR or an AGPR. When every user of the
+/// destination can read the source register directly, the users are rewritten
+/// to do so and the copy is deleted.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-eliminate-agpr-to-vgpr-copy"
+
+STATISTIC(NumEliminated, "Number of copies eliminated");
+
+namespace {
+
+/// Implementation shared by the legacy and new pass manager entry points.
+class AMDGPUEliminateAGPRToVGPRCopyImpl {
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &TRI;
+  MachineRegisterInfo &MRI;
+  VirtRegMap &VRM;
+  LiveRegMatrix &LRM;
+  LiveIntervals &LIS;
+
+public:
+  AMDGPUEliminateAGPRToVGPRCopyImpl(MachineFunction &MF, VirtRegMap &VRM,
+                                    LiveRegMatrix &LRM, LiveIntervals &LIS)
+      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
+        LIS(LIS) {}
+
+  /// Return true if \p MO, a use of a copy destination that was assigned a
+  /// VGPR, is a kind of use that this pass redirects to the copy's source.
+  bool isCompatibleUse(const MachineOperand &MO) const;
+
+  /// Eliminate the eligible copies in \p MF. Returns true on any change.
+  bool run(MachineFunction &MF) const;
+};
+
+// Accepted users are other full copies and MFMAs that read the register
+// through their src0 or src1 operand.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::isCompatibleUse(
+    const MachineOperand &MO) const {
+  const MachineInstr &UseMI = *MO.getParent();
+  if (SIInstrInfo::isMFMA(UseMI)) {
+    if (&MO == TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) ||
+        &MO == TII.getNamedOperand(UseMI, AMDGPU::OpName::src1))
+      return true;
+    LLVM_DEBUG(dbgs() << " Incompatible MFMA operand: " << UseMI);
+    return false;
+  }
+
+  if (UseMI.isFullCopy())
+    return true;
+
+  LLVM_DEBUG(dbgs() << " Incompatible user: " << UseMI);
+  return false;
+}
+
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
+  // This only applies on subtargets that have a configurable AGPR vs. VGPR
+  // allocation.
+  if (!ST.hasGFX90AInsts())
+    return false;
+
+  // Early exit if no AGPRs were assigned.
+  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+    return false;
+
+  bool MadeChange = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &CopyMI : make_early_inc_range(MBB)) {
+      // Find full copies...
+      if (!CopyMI.isFullCopy())
+        continue;
+
+      // ... whose destination was mapped to a VGPR or AGPR...
+      Register DstReg = CopyMI.getOperand(0).getReg();
+      if (!DstReg.isVirtual())
+        continue;
+      Register DstPhysReg = VRM.getPhys(DstReg);
+      if (!DstPhysReg)
+        continue;
+      const TargetRegisterClass *DstRC = TRI.getPhysRegBaseClass(DstPhysReg);
+      if (!TRI.hasVectorRegisters(DstRC) || TRI.hasSGPRs(DstRC))
+        continue;
+
+      // ... that has no other definition (replaceRegWith() below rewrites
+      // every operand naming DstReg, definitions included, so any other
+      // definition would turn into a write of the source register)...
+      if (!MRI.hasOneDef(DstReg))
+        continue;
+
+      // ... and whose source was mapped to an AGPR.
+      Register SrcReg = CopyMI.getOperand(1).getReg();
+      if (!SrcReg.isVirtual() || SrcReg == DstReg)
+        continue;
+      Register SrcPhysReg = VRM.getPhys(SrcReg);
+      if (!SrcPhysReg)
+        continue;
+      const TargetRegisterClass *SrcRC = TRI.getPhysRegBaseClass(SrcPhysReg);
+      if (!TRI.isAGPRClass(SrcRC))
+        continue;
+
+      bool DstIsAGPR = TRI.hasAGPRs(DstRC);
+
+      LLVM_DEBUG({
+        dbgs() << "AGPR->AVGPR copy: " << CopyMI;
+        dbgs() << " "
+               << printReg(DstReg, &TRI, CopyMI.getOperand(0).getSubReg(), &MRI)
+               << " <-> " << printReg(DstPhysReg, &TRI, 0, &MRI) << "\n";
+        dbgs() << " "
+               << printReg(SrcReg, &TRI, CopyMI.getOperand(1).getSubReg(), &MRI)
+               << " <-> " << printReg(SrcPhysReg, &TRI, 0, &MRI) << "\n";
+      });
+
+      LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+      const VNInfo *SrcVNI = SrcLI.getVNInfoAt(LIS.getInstructionIndex(CopyMI));
+      assert(SrcVNI && "VNI must exist");
+
+      bool AllUsesCompatible =
+          all_of(MRI.use_operands(DstReg), [&](const MachineOperand &MO) {
+            // A destination assigned a VGPR is limited to the users accepted
+            // by isCompatibleUse(); an AGPR destination skips that check.
+            if (DstIsAGPR) {
+              LLVM_DEBUG(dbgs() << " Skipping user check (dst is AGPR)\n");
+            } else if (!isCompatibleUse(MO)) {
+              return false;
+            }
+
+            // Source must be available at use point, holding the same value
+            // that the copy read.
+            const MachineInstr &UseMI = *MO.getParent();
+            const VNInfo *UseVNI =
+                SrcLI.getVNInfoAt(LIS.getInstructionIndex(UseMI));
+            if (SrcVNI != UseVNI) {
+              LLVM_DEBUG(dbgs() << " AGPR no longer available at " << UseMI);
+              return false;
+            }
+            return true;
+          });
+      if (!AllUsesCompatible)
+        continue;
+
+      LLVM_DEBUG(dbgs() << " -> Eliminated\n");
+      ++NumEliminated;
+
+      // Remove the copy's destination register.
+      MRI.replaceRegWith(DstReg, SrcReg);
+      LRM.unassign(LIS.getInterval(DstReg));
+      LIS.removeInterval(DstReg);
+
+      // Delete the copy instruction.
+      LIS.RemoveMachineInstrFromMaps(CopyMI);
+      CopyMI.eraseFromParent();
+
+      // Recompute the source register's interval. The register is taken out
+      // of the live register matrix first and put back afterwards so that the
+      // matrix never refers to the interval object destroyed in between.
+      LRM.unassign(SrcLI);
+      LIS.removeInterval(SrcReg);
+      LRM.assign(LIS.createAndComputeVirtRegInterval(SrcReg),
+                 SrcPhysReg.asMCReg());
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+class AMDGPUEliminateAGPRToVGPRCopyLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUEliminateAGPRToVGPRCopyLegacy() : MachineFunctionPass(ID) {
+    initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Eliminate AGPR-to-VGPR Copy";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<VirtRegMapWrapperLegacy>();
+    AU.addRequired<LiveRegMatrixWrapperLegacy>();
+
+    AU.addPreserved<LiveIntervalsWrapperPass>();
+    AU.addPreserved<VirtRegMapWrapperLegacy>();
+    AU.addPreserved<LiveRegMatrixWrapperLegacy>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+                      "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_END(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+                    "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+
+char AMDGPUEliminateAGPRToVGPRCopyLegacy::ID = 0;
+
+char &llvm::AMDGPUEliminateAGPRToVGPRCopyLegacyID =
+    AMDGPUEliminateAGPRToVGPRCopyLegacy::ID;
+
+bool AMDGPUEliminateAGPRToVGPRCopyLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  auto &VRM = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  AMDGPUEliminateAGPRToVGPRCopyImpl Impl(MF, VRM, LRM, LIS);
+  return Impl.run(MF);
+}
+
+PreservedAnalyses
+AMDGPUEliminateAGPRToVGPRCopyPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  VirtRegMap &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
+  LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+
+  AMDGPUEliminateAGPRToVGPRCopyImpl Impl(MF, VRM, LRM, LIS);
+  if (!Impl.run(MF))
+    return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d927d0e89..34963826d16a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
 #endif
 MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-eliminate-agpr-to-vgpr-copy", AMDGPUEliminateAGPRToVGPRCopyPass())
 MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
 MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
 MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f17033d04a8..5512b15f4cb31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -528,6 +528,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
+  initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -1594,6 +1595,7 @@ bool GCNPassConfig::addPreRewrite() {
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
 
+  addPass(&AMDGPUEliminateAGPRToVGPRCopyLegacyID);
addPass(&AMDGPURewriteAGPRCopyMFMALegacyID); return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c466f9cf0f359..abc8020920e7d 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUEliminateAGPRToVGPRCopy.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 2a5c65278f7dc..ed865dc62592b 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -377,6 +377,7 @@ ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: GCN NSA Reassign +; GCN-O1-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; GCN-O1-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load @@ -689,6 +690,7 @@ ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: GCN NSA Reassign +; GCN-O1-OPTS-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; GCN-O1-OPTS-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load @@ -1007,6 +1009,7 @@ ; GCN-O2-NEXT: Live Register Matrix ; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: GCN NSA Reassign +; GCN-O2-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; GCN-O2-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load @@ -1338,6 +1341,7 @@ ; GCN-O3-NEXT: Live Register Matrix ; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: GCN NSA Reassign +; GCN-O3-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; GCN-O3-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O3-NEXT: Virtual Register 
Rewriter ; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 6110b3101020a..fdf4dfd1eaefa 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -103,6 +103,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 @@ -134,14 +137,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed 
$agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -149,7 +180,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -163,6 +194,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 @@ -194,14 +228,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr 
addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed 
$agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -209,7 +271,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -338,45 +400,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 
killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -384,7 +446,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -399,45 +461,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: 
v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; 
kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -445,7 +507,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -566,8 +628,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 @@ -598,14 +663,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 
killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -613,7 +706,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; 
GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -626,8 +719,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 @@ -658,14 +754,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: 
def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -673,7 +797,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -865,75 +989,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0x42fe0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; 
kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -941,7 +1065,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -957,75 +1081,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX942-NEXT: 
v_accvgpr_write_b32 a21, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB3_1: ; 
%for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -1033,7 +1157,7 @@ define amdgpu_kernel 
void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1156,45 +1280,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 
s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: 
s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -1202,7 +1326,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1217,45 +1341,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; 
GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -1263,7 +1387,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1429,43 +1553,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: 
s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -1473,7 +1597,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1492,43 +1616,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: 
v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; 
kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -1536,7 +1660,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1695,11 +1819,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1728,18 +1852,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; 
kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1759,11 +1911,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942: ; 
%bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1792,18 +1944,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: 
def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %exit +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,42 +2332,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: 
v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX90A-NEXT: ; 
kill: def $agpr5 killed $agpr5 +; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3 +; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit @@ -2195,7 +2376,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 5 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2216,42 +2397,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; 
GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30 +; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29 +; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28 +; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27 +; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26 +; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25 +; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24 +; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23 +; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22 +; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21 +; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20 +; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19 +; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18 +; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17 +; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16 +; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15 +; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14 +; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13 +; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12 +; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11 +; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10 +; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9 +; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8 +; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7 +; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6 +; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5 +; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4 +; GFX942-NEXT: ; kill: def 
$agpr3 killed $agpr3 +; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2 +; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1 +; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %exit @@ -2259,7 +2441,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll index ea6449b99b516..aaca68f3d606f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -31,6 +31,7 @@ ; DEFAULT-NEXT: Live Register Matrix ; DEFAULT-NEXT: Greedy Register Allocator ; DEFAULT-NEXT: GCN NSA Reassign +; DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load @@ -78,6 +79,7 @@ ; BASIC-DEFAULT-NEXT: Live Register Matrix ; BASIC-DEFAULT-NEXT: Greedy Register Allocator ; BASIC-DEFAULT-NEXT: GCN NSA Reassign +; BASIC-DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; BASIC-DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load @@ -101,6 +103,7 @@ ; DEFAULT-BASIC-NEXT: Live Register Matrix ; DEFAULT-BASIC-NEXT: Basic Register Allocator ; DEFAULT-BASIC-NEXT: GCN NSA Reassign +; DEFAULT-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; 
DEFAULT-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load @@ -130,6 +133,7 @@ ; BASIC-BASIC-NEXT: Live Register Matrix ; BASIC-BASIC-NEXT: Basic Register Allocator ; BASIC-BASIC-NEXT: GCN NSA Reassign +; BASIC-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy ; BASIC-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index eb0d5465cacd9..b43c19f44614c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -77,26 +77,18 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_endpgm ; GFX90A-NEXT: .LBB0_2: ; %use -; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -155,26 +147,14 @@ define 
amdgpu_kernel void @max_10_vgprs_used_9a() #1 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_endpgm @@ -331,7 +311,6 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 @@ -339,13 +318,12 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dword v0, a0, s[2:3] ; GFX90A-NEXT: s_endpgm bb: diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn 
b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 3d11ce566207a..d907aee996390 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -141,6 +141,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUCodeGenPrepare.cpp", "AMDGPUCombinerHelper.cpp", "AMDGPUCtorDtorLowering.cpp", + "AMDGPUEliminateAGPRToVGPRCopy.cpp", "AMDGPUExportClustering.cpp", "AMDGPUExportKernelRuntimeHandles.cpp", "AMDGPUFrameLowering.cpp",