From e799dcab03d05a20b16eb5d2f45621ec5b9750fe Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 8 Apr 2025 16:51:38 -0700 Subject: [PATCH 1/6] [AMDGPU] Teach iterative schedulers about IGLP Change-Id: Iee536f6c3238c59304ebe814c56eafa2219ff408 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 9 + llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 5 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 48 +- .../lib/Target/AMDGPU/GCNIterativeScheduler.h | 3 + llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 +- llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll | 2 + ...vm.amdgcn.sched.group.barrier.iterative.ll | 933 ++++++++++++++++++ .../AMDGPU/schedule-regpressure-limit2.ll | 8 +- 9 files changed, 1006 insertions(+), 21 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 7b4d00c8214cb..c45ee8e28c68f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2696,6 +2696,15 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { } // namespace +namespace llvm { + +namespace AMDGPU { +bool isIGLPMutationOnly(unsigned Opcode) { + return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; +} + +} // end namespace AMDGPU + /// \p Phase specifes whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. 
pre and post-RA scheduling / multiple diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096f26d67..b7e8c711c6fcc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -18,6 +18,11 @@ namespace llvm { namespace AMDGPU { // The current phase of instruction scheduling enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; + +// Return true if the instruction is mutually exclusive with all non-IGLP DAG +// mutations, requiring all other mutations to be disabled. +bool isIGLPMutationOnly(unsigned Opcode); + } // namespace AMDGPU std::unique_ptr diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e06d5a08e06a5..87c47998669b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -605,6 +605,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } @@ -616,12 +617,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { - return new GCNIterativeScheduler(C, - GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + auto *DAG = new GCNIterativeScheduler( + C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); + return DAG; } static ScheduleDAGInstrs * @@ -632,6 
+636,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index da065e8d8cb6b..68e07f007e0f1 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "GCNIterativeScheduler.h" +#include "AMDGPUIGroupLP.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" @@ -118,6 +119,25 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, } #endif +void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) { + bool HasIGLPInstrs = false; + + for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) { + if (AMDGPU::isIGLPMutationOnly(I->getOpcode())) { + HasIGLPInstrs = true; + break; + } + } + + if (HasIGLPInstrs) { + SavedMutations.clear(); + SavedMutations.swap(Mutations); + auto SchedPhase = IsReentry ? 
AMDGPU::SchedulingPhase::PreRAReentry + : AMDGPU::SchedulingPhase::Initial; + addMutation(createIGroupLPDAGMutation(SchedPhase)); + } +} + // DAG builder helper class GCNIterativeScheduler::BuildDAG { GCNIterativeScheduler &Sch; @@ -125,14 +145,15 @@ class GCNIterativeScheduler::BuildDAG { SmallVector BotRoots; public: - BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) - : Sch(_Sch) { + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false) + : Sch(_Sch) { auto *BB = R.Begin->getParent(); Sch.BaseClass::startBlock(BB); Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); - + Sch.swapIGLPMutations(R, IsReentry); Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, /*TrackLaneMask*/true); + Sch.postProcessDAG(); Sch.Topo.InitDAGTopologicalSorting(); Sch.findRootsAndBiasEdges(TopRoots, BotRoots); } @@ -432,13 +453,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { auto NewOcc = TargetOcc; for (auto *R : Regions) { + // Always build the DAG to add mutations + BuildDAG DAG(*R, *this); + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) - break; + continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); - BuildDAG DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto MaxRP = getSchedulePressure(*R, MinSchedule); LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; @@ -469,8 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy && Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = true; + } // This is really weird but for some magic scheduling regions twice // gives performance improvement @@ -489,7 +515,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( 
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc); for (auto *R : Regions) { OverrideLegacyStrategy Ovr(*R, LStrgy, *this); - + IsReentry |= I > 0; + swapIGLPMutations(*R, IsReentry); Ovr.schedule(); const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); @@ -556,8 +583,11 @@ void GCNIterativeScheduler::scheduleILP( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy && Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = true; + } TgtOcc = std::min(Occ, TgtOcc); LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " @@ -566,7 +596,7 @@ void GCNIterativeScheduler::scheduleILP( unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto *R : Regions) { - BuildDAG DAG(*R, *this); + BuildDAG DAG(*R, *this, IsReentry); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); const auto RP = getSchedulePressure(*R, ILPSchedule); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h index c0228540b7a2f..f731b1fc7e0df 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { const StrategyKind Strategy; mutable GCNUpwardRPTracker UPTracker; + std::vector> SavedMutations; + class BuildDAG; class OverrideLegacyStrategy; @@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { return getRegionPressure(R.Begin, R.End); } + void swapIGLPMutations(const Region &R, bool IsReentry); void setBestSchedule(Region &R, ScheduleRef Schedule, const GCNRegPressure &MaxRP = GCNRegPressure()); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ea9bc88bbe86b..5e5d06a40aad4 100644 --- 
a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -188,12 +188,6 @@ static void getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } -// Return true if the instruction is mutually exclusive with all non-IGLP DAG -// mutations, requiring all other mutations to be disabled. -static bool isIGLPMutationOnly(unsigned Opcode) { - return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; -} - void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -1163,7 +1157,7 @@ bool GCNSchedStage::initGCNRegion() { StageID == GCNSchedStageID::ILPInitialSchedule) { for (auto &I : DAG) { Unsched.push_back(&I); - if (isIGLPMutationOnly(I.getOpcode())) + if (AMDGPU::isIGLPMutationOnly(I.getOpcode())) DAG.RegionsWithIGLPInstrs[RegionIdx] = true; } } else { @@ -2048,7 +2042,7 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { return any_of(*DAG, [](MachineBasicBlock::iterator MI) { - return isIGLPMutationOnly(MI->getOpcode()); + return AMDGPU::isIGLPMutationOnly(MI->getOpcode()); }); } diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll index 1113acb3c0305..ba1cb9b26dec6 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll @@ -1,4 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s ; Test should not result in build failure ; CHECK-LABEL: shouldNotReApply diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll new file mode 100644 
index 0000000000000..0764cd5d34d75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -0,0 +1,933 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-ilp < %s | FileCheck -check-prefix=GCN-ILP %s + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 +; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; 
GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208 
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) 
size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-MINREG-NEXT: 
ds_write_b128 v0, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 
a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier 
mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: 
ds_write_b128 v1, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-ILP-NEXT: 
ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; 
GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; 
GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; 
GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = 
getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ret void +} + + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_split_region(ptr 
addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 
offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000) +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) 
SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] 
offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; 
GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000) +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_write_b128 
v3, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], 
v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier 
mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000) +; GCN-ILP-NEXT: ds_read_b128 
a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v2, 
a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = 
getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; partition the regions + call void @llvm.amdgcn.sched.barrier(i32 0) + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float
1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 7d771342a598e..ea28334fccf5a 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -14,8 +14,12 @@ ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}} ; stores may alias loads -; VI: NumSgprs: {{[0-9]$}} -; VI: NumVgprs: {{[1-3][0-9]$}} +; VI-MINREG: NumSgprs: {{[0-9]$}} +; VI-MINREG: NumVgprs: {{[1-3][0-9]$}} + +; stores may alias loads +; VI-MAXOCC: NumSgprs: {{[1-3][0-9]$}} +; VI-MAXOCC: NumVgprs: {{[1-6][0-9]$}} define amdgpu_kernel void 
@load_fma_store(ptr addrspace(3) nocapture readonly %in_arg, ptr addrspace(1) nocapture %out_arg) { bb: From 8edf064133578a329d220a2b8c5324820847edbf Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 8 Apr 2025 17:19:36 -0700 Subject: [PATCH 2/6] Remove from MaxMemoryClause Scheduler Change-Id: Ibf08571f46c2a378c6e1f5c968128571a5938367 --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 87c47998669b2..8f3138acaea0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -605,7 +605,6 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } From 135d8b655e408481eeac02aee1bc283dde45f2aa Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 11 Apr 2025 12:58:33 -0700 Subject: [PATCH 3/6] Move isIGLPMutationOnly to SIInstrInfo Change-Id: If3650ce24a1a047557b3e40363b72aefd909e873 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 7 ------- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 4 ---- llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 5 +++-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 +++++--- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 6 ++++++ 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index c45ee8e28c68f..34d65aa105686 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2698,13 +2698,6 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { namespace llvm { -namespace AMDGPU { -bool isIGLPMutationOnly(unsigned Opcode) { - return 
Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; -} - -} // end namespace AMDGPU - /// \p Phase specifes whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. pre and post-RA scheduling / multiple diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index b7e8c711c6fcc..7f1d1e54dd07d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -19,10 +19,6 @@ namespace AMDGPU { // The current phase of instruction scheduling enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; -// Return true if the instruction is mutually exclusive with all non-IGLP DAG -// mutations, requiring all other mutations to be disabled. -bool isIGLPMutationOnly(unsigned Opcode); - } // namespace AMDGPU std::unique_ptr<ScheduleDAGMutation> diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 68e07f007e0f1..19cdfc01c02c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -121,9 +121,9 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) { bool HasIGLPInstrs = false; - + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(TII); for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) { - if (AMDGPU::isIGLPMutationOnly(I->getOpcode())) { + if (SII->isIGLPMutationOnly(I->getOpcode())) { HasIGLPInstrs = true; break; } @@ -134,6 +134,7 @@ void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) { SavedMutations.swap(Mutations); auto SchedPhase = IsReentry ?
AMDGPU::SchedulingPhase::PreRAReentry : AMDGPU::SchedulingPhase::Initial; + addMutation(createIGroupLPDAGMutation(SchedPhase)); } } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 5e5d06a40aad4..5678512748569 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1155,9 +1155,10 @@ bool GCNSchedStage::initGCNRegion() { Unsched.reserve(DAG.NumRegionInstrs); if (StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule) { + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); for (auto &I : DAG) { Unsched.push_back(&I); - if (AMDGPU::isIGLPMutationOnly(I.getOpcode())) + if (SII->isIGLPMutationOnly(I.getOpcode())) DAG.RegionsWithIGLPInstrs[RegionIdx] = true; } } else { @@ -2041,8 +2042,9 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( } static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { - return any_of(*DAG, [](MachineBasicBlock::iterator MI) { - return AMDGPU::isIGLPMutationOnly(MI->getOpcode()); + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); + return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { + return SII->isIGLPMutationOnly(MI->getOpcode()); }); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index d63225c067c9d..9eb9444d0fe96 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -985,6 +985,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isIGLP(const MachineInstr &MI) const { return isIGLP(MI.getOpcode()); } + // Return true if the instruction is mutually exclusive with all non-IGLP DAG + // mutations, requiring all other mutations to be disabled.
+ bool isIGLPMutationOnly(unsigned Opcode) const { + return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; + } + static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: From bfcc7caebbf79d5bddea2124f7acc35bc39cab51 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 11 Apr 2025 13:21:02 -0700 Subject: [PATCH 4/6] Merge conflicts Change-Id: I963052236780781fd0ae56ce970c1f6179bdb904 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 +++- llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 34d65aa105686..1f5f1a9cfb784 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2704,6 +2704,8 @@ namespace llvm { /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. 
std::unique_ptr<ScheduleDAGMutation> -llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { +createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { return std::make_unique<IGroupLPDAGMutation>(Phase); } + +} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index ea28334fccf5a..462ac23ec7e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s +; RUN: llc -mtriple=amdgcn
-mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} ; SI-MINREG: NumVgprs: {{[1-9]$}} From 17211ee04597a947d9a5a518326a67e82fec312d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 11 Apr 2025 13:31:56 -0700 Subject: [PATCH 5/6] Remove namespace Change-Id: Ia66c6504e1edaef9bb8ce607c869deac721aff2f --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 1f5f1a9cfb784..7b4d00c8214cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2696,16 +2696,12 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { } // namespace -namespace llvm { - /// \p Phase specifes whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. 
std::unique_ptr<ScheduleDAGMutation> -createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { +llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { return std::make_unique<IGroupLPDAGMutation>(Phase); } - -} From 0807508bd09c3c617b71448646fc50e14bd54aa4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 11 Apr 2025 15:32:10 -0700 Subject: [PATCH 6/6] Update llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h Remove newline Co-authored-by: Austin Kerbow --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 7f1d1e54dd07d..aff7096f26d67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -18,7 +18,6 @@ namespace llvm { namespace AMDGPU { // The current phase of instruction scheduling enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; - } // namespace AMDGPU std::unique_ptr<ScheduleDAGMutation>