diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 58482ea69d0b0..9fbf9e5fe8eeb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers(
     cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
     cl::init(false));
 
+static cl::opt<unsigned> PendingQueueLimit(
+    "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+    cl::desc(
+        "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+    cl::init(256));
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 #define DUMP_MAX_REG_PRESSURE
 static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
@@ -335,17 +341,62 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   }
 }
 
+/// Returns true if the pending queue of \p Zone should be examined, i.e. the
+/// target has an instruction scheduling model with a non-zero micro-op buffer
+/// and the combined Available+Pending size is at most PendingQueueLimit.
+static bool shouldCheckPending(SchedBoundary &Zone,
+                               const TargetSchedModel *SchedModel) {
+  bool HasBufferedModel =
+      SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+  unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+  return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+/// Wraps SchedBoundary::pickOnlyChoice(). Its result is returned only when the
+/// pending queue will not be examined or is empty; otherwise nullptr is
+/// returned so that the caller compares candidates from both queues.
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+                             const TargetSchedModel *SchedModel) {
+  // pickOnlyChoice() releases pending instructions and checks for new hazards.
+  SUnit *OnlyChoice = Zone.pickOnlyChoice();
+  if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+    return OnlyChoice;
+
+  return nullptr;
+}
+
+#ifndef NDEBUG
+/// Debug-only helper (its declaration is guarded by NDEBUG). Prints the
+/// preferred candidate, the candidate it beat (if any), and its trace.
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+                                              const SchedCandidate &Preferred) {
+  LLVM_DEBUG({
+    dbgs() << "Prefer:\t\t";
+    DAG->dumpNode(*Preferred.SU);
+
+    if (Current.SU) {
+      dbgs() << "Not:\t";
+      DAG->dumpNode(*Current.SU);
+    }
+
+    dbgs() << "Reason:\t\t";
+    traceCandidate(Preferred);
+  });
+}
+#endif
+
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeFromQueue()
 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const CandPolicy &ZonePolicy,
                                          const RegPressureTracker &RPTracker,
-                                         SchedCandidate &Cand,
+                                         SchedCandidate &Cand, bool &IsPending,
                                          bool IsBottomUp) {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
+  IsPending = false;
   if (DAG->isTrackingPressure()) {
     if (!GCNTrackers) {
       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -358,8 +409,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       VGPRPressure = T->getPressure().getArchVGPRNum();
     }
   }
-  ReadyQueue &Q = Zone.Available;
-  for (SUnit *SU : Q) {
+  LLVM_DEBUG(dbgs() << "Available Q:\n");
+  ReadyQueue &AQ = Zone.Available;
+  for (SUnit *SU : AQ) {
 
     SchedCandidate TryCand(ZonePolicy);
     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -371,27 +423,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       // Initialize resource delta if needed in case future heuristics query it.
       if (TryCand.ResDelta == SchedResourceDelta())
         TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
       Cand.setBest(TryCand);
-      LLVM_DEBUG(traceCandidate(Cand));
+    } else {
+      LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
+    }
+  }
+
+  if (!shouldCheckPending(Zone, SchedModel))
+    return;
+
+  LLVM_DEBUG(dbgs() << "Pending Q:\n");
+  ReadyQueue &PQ = Zone.Pending;
+  for (SUnit *SU : PQ) {
+
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+                  VGPRPressure, IsBottomUp);
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+    tryPendingCandidate(Cand, TryCand, ZoneArg);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+      IsPending = true;
+      Cand.setBest(TryCand);
+    } else {
+      LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
     }
   }
 }
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+                                               bool &PickedPending) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
-  if (SUnit *SU = Bot.pickOnlyChoice()) {
+  if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
     IsTopNode = false;
     return SU;
   }
-  if (SUnit *SU = Top.pickOnlyChoice()) {
+  if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
     IsTopNode = true;
     return SU;
   }
-  // Set the bottom-up policy based on the state of the current bottom zone and
-  // the instructions outside the zone, including the top zone.
+  // Set the bottom-up policy based on the state of the current bottom zone
+  // and the instructions outside the zone, including the top zone.
   CandPolicy BotPolicy;
   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
   // Set the top-down policy based on the state of the current top zone and
@@ -399,12 +479,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   CandPolicy TopPolicy;
   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
 
+  bool BotPending = false;
   // See if BotCand is still valid (because we previously scheduled from Top).
   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
       BotCand.Policy != BotPolicy) {
     BotCand.reset(CandPolicy());
     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+                      BotPending,
                       /*IsBottomUp=*/true);
     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
   } else {
@@ -414,6 +496,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       SchedCandidate TCand;
       TCand.reset(CandPolicy());
       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+                        BotPending,
                         /*IsBottomUp=*/true);
       assert(TCand.SU == BotCand.SU &&
              "Last pick result should correspond to re-picking right now");
@@ -421,12 +504,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
 #endif
   }
 
+  bool TopPending = false;
   // Check if the top Q has a better candidate.
   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
       TopCand.Policy != TopPolicy) {
     TopCand.reset(CandPolicy());
     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+                      TopPending,
                       /*IsBottomUp=*/false);
     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
   } else {
@@ -436,6 +521,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       SchedCandidate TCand;
       TCand.reset(CandPolicy());
       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+                        TopPending,
                         /*IsBottomUp=*/false);
       assert(TCand.SU == TopCand.SU &&
              "Last pick result should correspond to re-picking right now");
@@ -446,12 +532,23 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Pick best from BotCand and TopCand.
   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
-  SchedCandidate Cand = BotCand;
-  TopCand.Reason = NoCand;
-  tryCandidate(Cand, TopCand, nullptr);
-  if (TopCand.Reason != NoCand) {
-    Cand.setBest(TopCand);
+  SchedCandidate Cand = BotPending ? TopCand : BotCand;
+  SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+  PickedPending = BotPending && TopPending;
+
+  TryCand.Reason = NoCand;
+  if (BotPending || TopPending) {
+    // Compare against TryCand (not TopCand) so that the Reason tested below
+    // reflects the outcome of this comparison.
+    PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
+  } else {
+    tryCandidate(Cand, TryCand, nullptr);
+  }
+
+  if (TryCand.Reason != NoCand) {
+    Cand.setBest(TryCand);
   }
+
   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
 
   IsTopNode = Cand.AtTop;
@@ -466,35 +563,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
     return nullptr;
   }
+  bool PickedPending;
   SUnit *SU;
   do {
+    PickedPending = false;
     if (RegionPolicy.OnlyTopDown) {
-      SU = Top.pickOnlyChoice();
+      SU = pickOnlyChoice(Top, SchedModel);
       if (!SU) {
         CandPolicy NoPolicy;
         TopCand.reset(NoPolicy);
         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+                          PickedPending,
                           /*IsBottomUp=*/false);
         assert(TopCand.Reason != NoCand && "failed to find a candidate");
         SU = TopCand.SU;
       }
       IsTopNode = true;
     } else if (RegionPolicy.OnlyBottomUp) {
-      SU = Bot.pickOnlyChoice();
+      SU = pickOnlyChoice(Bot, SchedModel);
       if (!SU) {
         CandPolicy NoPolicy;
         BotCand.reset(NoPolicy);
         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+                          PickedPending,
                           /*IsBottomUp=*/true);
         assert(BotCand.Reason != NoCand && "failed to find a candidate");
         SU = BotCand.SU;
       }
       IsTopNode = false;
     } else {
-      SU = pickNodeBidirectional(IsTopNode);
+      SU = pickNodeBidirectional(IsTopNode, PickedPending);
     }
   } while (SU->isScheduled);
 
+  if (PickedPending) {
+    unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+    SchedBoundary &Zone = IsTopNode ? Top : Bot;
+    unsigned CurrentCycle = Zone.getCurrCycle();
+    if (ReadyCycle > CurrentCycle)
+      Zone.bumpCycle(ReadyCycle);
+
+    // FIXME: checkHazard() doesn't give information about which cycle the
+    // hazard will resolve so just keep bumping the cycle by 1. This could be
+    // made more efficient if checkHazard() returned more details.
+    while (Zone.checkHazard(SU))
+      Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+    Zone.releasePending();
+  }
+
   if (SU->isTopReady())
     Top.removeReady(SU);
   if (SU->isBottomReady())
@@ -540,6 +657,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
   return *std::next(CurrentStage);
 }
 
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+                                           SchedCandidate &TryCand,
+                                           SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  // Avoid exceeding the target's limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+  }
+
+  return false;
+}
+
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea42677454e4..cab3cba3b35d7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,34 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
 /// heuristics to determine excess/critical pressure sets.
 class GCNSchedStrategy : public GenericScheduler {
 protected:
-  SUnit *pickNodeBidirectional(bool &IsTopNode);
+  SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                          const RegPressureTracker &RPTracker,
-                         SchedCandidate &Cand, bool IsBottomUp);
+                         SchedCandidate &Cand, bool &IsPending,
+                         bool IsBottomUp);
 
   void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
                      const RegPressureTracker &RPTracker,
                      const SIRegisterInfo *SRI, unsigned SGPRPressure,
                      unsigned VGPRPressure, bool IsBottomUp);
 
+  /// Evaluates instructions in the pending queue using a subset of scheduling
+  /// heuristics.
+  ///
+  /// Instructions that cannot be issued due to hardware constraints are placed
+  /// in the pending queue rather than the available queue, making them normally
+  /// invisible to scheduling heuristics. However, in certain scenarios (such as
+  /// avoiding register spilling), it may be beneficial to consider scheduling
+  /// these not-yet-ready instructions.
+  bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                           SchedBoundary *Zone) const;
+
+#ifndef NDEBUG
+  void printCandidateDecision(const SchedCandidate &Current,
+                              const SchedCandidate &Preferred);
+#endif
+
   std::vector<unsigned> Pressure;
 
   std::vector<unsigned> MaxPressure;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec326327e..689d1472d6010 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1153 +6,1147 @@
   define amdgpu_kernel void @largeInterleave() #0 { ret void }
   ; GCN-LABEL: largeInterleave:
   ; GCN: ; %bb.0:
-  ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; GCN-NEXT: ; implicit-def: $vgpr0
-  ; GCN-NEXT: ; implicit-def: $vgpr2
-  ; GCN-NEXT: ; implicit-def: $vgpr1
-  ; GCN-NEXT: ; implicit-def: $vgpr8
-  ; GCN-NEXT: ; implicit-def: $vgpr94
-  ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
-  ; GCN-NEXT: ; implicit-def: $vgpr106
-  ; GCN-NEXT: ; implicit-def: $vgpr132
-  ; GCN-NEXT: ; implicit-def: $vgpr133
-  ; GCN-NEXT: ; implicit-def: $vgpr139
-  ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
-  ; GCN-NEXT: ; iglp_opt mask(0x00000002)
-  ; GCN-NEXT: ; implicit-def: $sgpr0
+  ; GCN-NEXT: ; implicit-def: $vgpr16
+  ; GCN-NEXT: ; implicit-def: $vgpr25
   ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-  ; GCN-NEXT:
v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_readfirstlane_b32 s17, v16 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s18, s17, 7 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 + ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 + ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 + ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 + ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 + ; GCN-NEXT: ; implicit-def: $vgpr213 + ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr246 + ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr26 + ; GCN-NEXT: ; implicit-def: $vgpr27 + ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 + ; 
GCN-NEXT: v_add_u32_e32 v231, v19, v26 + ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr28 + ; GCN-NEXT: ; implicit-def: $vgpr29 + ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 + ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 + ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 + ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 + ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 + ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr196 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr211 + ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 + ; GCN-NEXT: ; implicit-def: $vgpr198 + ; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 + ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 + ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr30 + ; GCN-NEXT: ; implicit-def: $vgpr31 + ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 + ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 + ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 + ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 + ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 + ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v230, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 
; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: 
ds_read_b128 v[180:183], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[188:191], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 
sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] + ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] + ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 + ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 + ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 + ; 
GCN-NEXT: ds_read_b128 v[160:163], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 + ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7 + ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] + ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], 
v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] + ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: ds_read_b128 v[230:233], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: 
ds_read_b128 v[234:237], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, 
v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; 
GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: 
v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v199, v[238:239] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v200, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v201, v[242:243] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; 
GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v202, v[244:245] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] + ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; 
GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] 
op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] 
op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; 
GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] + ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 + ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 + ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 + ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 + ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 + ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 + ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[96:111], v[234:235], v[128:129], v[96:111] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 + ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, 
v102 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 + ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 
v129, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 + ; GCN-NEXT: v_mul_f32_e32 v141, 
0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_exp_f32_e32 v114, v138 + ; GCN-NEXT: v_exp_f32_e32 v115, v139 + ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_exp_f32_e32 v117, v141 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 + ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_exp_f32_e32 v120, v144 + ; GCN-NEXT: v_exp_f32_e32 v113, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], 
v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 + ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 + ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 + ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 + ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112 + ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] 
op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 + ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 + ; GCN-NEXT: v_exp_f32_e32 v121, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: 
v_cvt_f16_f32_e32 v136, v122 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 + ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v97, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 + ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 + ; GCN-NEXT: v_exp_f32_e32 v98, v138 + ; GCN-NEXT: v_exp_f32_e32 v99, v127 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 + ; GCN-NEXT: v_exp_f32_e32 v100, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], 
v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v199, v[188:189] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v200, v[190:191] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v201, v[192:193] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v202, v[194:195] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v102, v142 + ; GCN-NEXT: buffer_load_dwordx2 
v[142:143], v208, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; 
GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; 
GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 + ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 + ; GCN-NEXT: v_exp_f32_e32 v103, v150 + ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 + ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 + ; GCN-NEXT: v_exp_f32_e32 v104, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 + ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 + ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 + ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 + ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v106, v156 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 + ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v107, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 + ; GCN-NEXT: v_exp_f32_e32 v108, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] + ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 + ; GCN-NEXT: v_exp_f32_e32 v109, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 + ; GCN-NEXT: v_exp_f32_e32 v110, v156 + ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v111, v146 + ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v80, v129 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] + ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 + ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 + ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 + ; GCN-NEXT: v_exp_f32_e32 v82, v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, 
v83, -v128 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v83, v135 + ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v199, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v200, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v201, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: 
buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: ds_write_b64 v202, v[154:155] + ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 + ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v85, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 + ; GCN-NEXT: v_exp_f32_e32 v86, v156 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 + ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 + ; GCN-NEXT: v_exp_f32_e32 v87, v157 + ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 + ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 + ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 + ; GCN-NEXT: v_exp_f32_e32 v88, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 + ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 + ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 + ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 + ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v89, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 + ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v90, v158 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 + ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 + ; GCN-NEXT: v_exp_f32_e32 v91, v139 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 + ; GCN-NEXT: v_exp_f32_e32 v92, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 + ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 + ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v93, v125 + ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v94, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v95, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v125, v129 + ; GCN-NEXT: ds_read_b128 v[132:135], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; 
GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 + ; GCN-NEXT: v_exp_f32_e32 v126, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 + ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 + ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v127, v143 + ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 
v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: 
v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; 
GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v199, v[150:151] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v200, v[152:153] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 + ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v202, v[156:157] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; 
GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 
v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 
offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 + ; GCN-NEXT: v_exp_f32_e32 v131, v144 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 + ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_exp_f32_e32 v132, v145 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v133, v141 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[68:71], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v72, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v144 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_exp_f32_e32 v74, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, 
v133 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] + ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 + ; GCN-NEXT: v_exp_f32_e32 v75, v142 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 + ; GCN-NEXT: v_exp_f32_e32 v76, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 + ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v77, v147 + ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 + ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v78, v67 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_exp_f32_e32 v79, v148 + ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v142, v146 + ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v137, v147 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v138, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_exp_f32_e32 v52, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 + ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 + ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 + ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 + ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 + ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v116, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v117, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v118, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v119, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v120, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v121, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v122, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v123, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v124, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v96, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v97, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v98, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v99, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v100, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 + ; GCN-NEXT: 
v_add_f32_e32 v53, v110, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 + ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 + ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 + ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 + ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 + ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 + ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee49b93f..e174fc17e98fe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 
a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: 
ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: 
ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index aa099b60ef16d..b65a1a8e06c7d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 
offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: 
ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: 
v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 12 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: 
ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: 
v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: 
ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 12 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; EXACTCUTOFF-NEXT: s_nop 11 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788f8855a..8803f3ae4906f 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -367,77 +367,76 
@@ bb: define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-LABEL: illegal_mfma_after_rewrite: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[16:19] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3] ; CHECK-NEXT: 
s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v9, v8 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v8 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6 +; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: global_store_short v[0:1], v10, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; 
CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 -; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CHECK-NEXT: global_store_short v[0:1], v6, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: global_store_short v[0:1], v24, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] ; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] +; CHECK-NEXT: global_store_short v[0:1], v6, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: global_store_short v[12:13], v0, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 { ; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class: ; CHECK: ; %bb.0: +; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; 
CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v63, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v62, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v61, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v60, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v59, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v58, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v57, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v56, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v55, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v54, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v53, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v52, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v51, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v50, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v49, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v48, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v47, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v46, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v45, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v44, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v43, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v42, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v41, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v40, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v39, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v38, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v37, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v36, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v35, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v34, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0 -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v33 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v34 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v35 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v36 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v37 -; CHECK-NEXT: 
v_accvgpr_write_b32 a6, v38 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v39 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v40 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v41 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v42 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v43 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v44 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v45 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v46 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v47 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v48 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v49 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v50 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v51 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v52 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v53 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v54 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v55 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v56 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v57 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v58 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v59 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v60 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 -; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] -; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: 
s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31] +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] +; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41] +; CHECK-NEXT: v_mov_b64_e32 
v[12:13], v[42:43] +; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45] +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47] +; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49] +; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51] +; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir new file mode 100644 index 0000000000000..33b2f69039f48 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir @@ -0,0 +1,32 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check that cycle counts are consistent with hazards. + +# CHECK: Cycle: 3 TopQ.A +# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c +# CHECK-NOT: Cycle: 9 TopQ.A +# CHECK: Cycle: 83 TopQ.A +# CHECK: Checking pending node SU(6) +# CHECK: Move SU(6) into Available Q + +--- +name: pending_queue_ready_cycle +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + %2:sgpr_128 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %18:areg_512 = IMPLICIT_DEF + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec + %7:vreg_512 = COPY %18 + SCHED_BARRIER 0 + S_NOP 0, implicit %18, implicit %7, implicit %84 + S_ENDPGM 0 +...