diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e06d5a08e06a5..8f3138acaea0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -616,12 +616,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { - return new GCNIterativeScheduler(C, - GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + auto *DAG = new GCNIterativeScheduler( + C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); + return DAG; } static ScheduleDAGInstrs * @@ -632,6 +635,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index da065e8d8cb6b..19cdfc01c02c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "GCNIterativeScheduler.h" +#include "AMDGPUIGroupLP.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" @@ -118,6 +119,26 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, } #endif +void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) { + bool 
HasIGLPInstrs = false; + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(TII); + for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) { + if (SII->isIGLPMutationOnly(I->getOpcode())) { + HasIGLPInstrs = true; + break; + } + } + + if (HasIGLPInstrs) { + SavedMutations.clear(); + SavedMutations.swap(Mutations); + auto SchedPhase = IsReentry ? AMDGPU::SchedulingPhase::PreRAReentry + : AMDGPU::SchedulingPhase::Initial; + + addMutation(createIGroupLPDAGMutation(SchedPhase)); + } +} + // DAG builder helper class GCNIterativeScheduler::BuildDAG { GCNIterativeScheduler &Sch; @@ -125,14 +146,15 @@ class GCNIterativeScheduler::BuildDAG { SmallVector BotRoots; public: - BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) - : Sch(_Sch) { + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false) + : Sch(_Sch) { auto *BB = R.Begin->getParent(); Sch.BaseClass::startBlock(BB); Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); - + Sch.swapIGLPMutations(R, IsReentry); Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, /*TrackLaneMask*/true); + Sch.postProcessDAG(); Sch.Topo.InitDAGTopologicalSorting(); Sch.findRootsAndBiasEdges(TopRoots, BotRoots); } @@ -432,13 +454,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { auto NewOcc = TargetOcc; for (auto *R : Regions) { + // Always build the DAG to add mutations + BuildDAG DAG(*R, *this); + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) - break; + continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); - BuildDAG DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto MaxRP = getSchedulePressure(*R, MinSchedule); LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; @@ -469,8 +493,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy 
&& Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = true; + } // This is really weird but for some magic scheduling regions twice // gives performance improvement @@ -489,7 +516,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc); for (auto *R : Regions) { OverrideLegacyStrategy Ovr(*R, LStrgy, *this); - + IsReentry |= I > 0; + swapIGLPMutations(*R, IsReentry); Ovr.schedule(); const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); @@ -556,8 +584,11 @@ void GCNIterativeScheduler::scheduleILP( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy && Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = true; + } TgtOcc = std::min(Occ, TgtOcc); LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " @@ -566,7 +597,7 @@ void GCNIterativeScheduler::scheduleILP( unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto *R : Regions) { - BuildDAG DAG(*R, *this); + BuildDAG DAG(*R, *this, IsReentry); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); const auto RP = getSchedulePressure(*R, ILPSchedule); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h index c0228540b7a2f..f731b1fc7e0df 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { const StrategyKind Strategy; mutable GCNUpwardRPTracker UPTracker; + std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + class BuildDAG; class OverrideLegacyStrategy; @@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { return getRegionPressure(R.Begin, R.End); } + void 
swapIGLPMutations(const Region &R, bool IsReentry); void setBestSchedule(Region &R, ScheduleRef Schedule, const GCNRegPressure &MaxRP = GCNRegPressure()); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ea9bc88bbe86b..5678512748569 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -188,12 +188,6 @@ static void getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } -// Return true if the instruction is mutually exclusive with all non-IGLP DAG -// mutations, requiring all other mutations to be disabled. -static bool isIGLPMutationOnly(unsigned Opcode) { - return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; -} - void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -1161,9 +1155,10 @@ bool GCNSchedStage::initGCNRegion() { Unsched.reserve(DAG.NumRegionInstrs); if (StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule) { + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); for (auto &I : DAG) { Unsched.push_back(&I); - if (isIGLPMutationOnly(I.getOpcode())) + if (SII->isIGLPMutationOnly(I.getOpcode())) DAG.RegionsWithIGLPInstrs[RegionIdx] = true; } } else { @@ -2047,8 +2042,9 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( } static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { - return any_of(*DAG, [](MachineBasicBlock::iterator MI) { - return isIGLPMutationOnly(MI->getOpcode()); + const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); + return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { + return SII->isIGLPMutationOnly(MI->getOpcode()); }); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index d63225c067c9d..9eb9444d0fe96 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -985,6 
+985,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isIGLP(const MachineInstr &MI) const { return isIGLP(MI.getOpcode()); } + // Return true if the instruction is mutually exclusive with all non-IGLP DAG + // mutations, requiring all other mutations to be disabled. + bool isIGLPMutationOnly(unsigned Opcode) const { + return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; + } + static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll index 1113acb3c0305..ba1cb9b26dec6 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll @@ -1,4 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s ; Test should not result in build failure ; CHECK-LABEL: shouldNotReApply diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll new file mode 100644 index 0000000000000..0764cd5d34d75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -0,0 +1,933 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-ilp < 
%s | FileCheck -check-prefix=GCN-ILP %s + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 +; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 
offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier 
mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-MINREG-NEXT: 
ds_write_b128 v0, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: 
ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: 
s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 
offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: 
ds_write_b128 v0, a[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr 
addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void 
@llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ret void +} + + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_split_region(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-MINREG-NEXT: 
ds_read_b128 a[16:19], v3 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 
1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000) +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; 
GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier 
mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; 
GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_barrier 
mask(0x00000000) +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) 
SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: 
ds_write_b128 v3, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-ILP-NEXT: 
ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000) +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: 
s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; 
GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 
x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; partition the regions + call void @llvm.amdgcn.sched.barrier(i32 0) + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call
void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 7d771342a598e..462ac23ec7e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 
-amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} ; SI-MINREG: NumVgprs: {{[1-9]$}} @@ -14,8 +14,12 @@ ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}} ; stores may alias loads -; VI: NumSgprs: {{[0-9]$}} -; VI: NumVgprs: {{[1-3][0-9]$}} +; VI-MINREG: NumSgprs: {{[0-9]$}} +; VI-MINREG: NumVgprs: {{[1-3][0-9]$}} + +; stores may alias loads +; VI-MAXOCC: NumSgprs: {{[1-3][0-9]$}} +; VI-MAXOCC: NumVgprs: {{[1-6][0-9]$}} define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %in_arg, ptr addrspace(1) nocapture %out_arg) { bb: