diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9fbf9e5fe8eeb..64e57caa4df35 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -28,11 +28,20 @@
 #include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <string>

 #define DEBUG_TYPE "machine-scheduler"

@@ -970,6 +979,8 @@ void GCNScheduleDAGMILive::schedule() {

 GCNRegPressure
 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
+  if (Regions[RegionIdx].first == Regions[RegionIdx].second)
+    return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
   GCNDownwardRPTracker RPTracker(*LIS);
   RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
                     &LiveIns[RegionIdx]);
@@ -1267,33 +1278,225 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
 #define REMAT_PREFIX "[PreRARemat] "
 #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)

+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+Printable PreRARematStage::ScoredRemat::print() const {
+  return Printable([&](raw_ostream &OS) {
+    OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
+  });
+}
+#endif
+
 bool PreRARematStage::initGCNSchedStage() {
   // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
   // regions inbetween the defs and region we sinked the def to. Will need to be
   // fixed if there is another pass after this pass.
   assert(!S.hasNextStage());

-  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
     return false;

+  if (DAG.MinOccupancy >= MFI.getMaxWavesPerEU() &&
+      !MF.getFunction().hasFnAttribute("amdgpu-num-sgpr") &&
+      !MF.getFunction().hasFnAttribute("amdgpu-num-vgpr"))
+    return false;
+
+  // Maps all MIs (except lone terminators, which are not part of any region) to
+  // their parent region. Non-lone terminators are considered part of the region
+  // they delimitate.
+  DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
   // Before performing any IR modification record the parent region of each MI
   // and the parent MBB of each region.
   const unsigned NumRegions = DAG.Regions.size();
-  RegionBB.reserve(NumRegions);
   for (unsigned I = 0; I < NumRegions; ++I) {
     RegionBoundaries Region = DAG.Regions[I];
     for (auto MI = Region.first; MI != Region.second; ++MI)
       MIRegion.insert({&*MI, I});
-    RegionBB.push_back(Region.first->getParent());
+    MachineBasicBlock *ParentMBB = Region.first->getParent();
+    if (Region.second != ParentMBB->end())
+      MIRegion.insert({&*Region.second, I});
+    RegionBB.push_back(ParentMBB);
+  }
+
+#ifndef NDEBUG
+  auto PrintTargetRegions = [&]() -> void {
+    if (TargetRegions.none()) {
+      dbgs() << REMAT_PREFIX << "No target regions\n";
+      return;
+    }
+    dbgs() << REMAT_PREFIX << "Target regions:\n";
+    for (unsigned I : TargetRegions.set_bits())
+      dbgs() << REMAT_PREFIX << "  [" << I << "] " << RPTargets[I] << '\n';
+  };
+  auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
+    return Printable([&, Remat](raw_ostream &OS) {
+      // Concatenate all region numbers in which the register is unused and
+      // live-through.
+      std::string UnusedLTRegions;
+      for (unsigned I = 0; I < NumRegions; ++I) {
+        if (Remat.isUnusedLiveThrough(I)) {
+          if (!UnusedLTRegions.empty())
+            UnusedLTRegions += ",";
+          UnusedLTRegions += std::to_string(I);
+        }
+      }
+      if (!UnusedLTRegions.empty())
+        UnusedLTRegions = "- " + UnusedLTRegions + " -";
+      OS << "[" << Remat.DefRegion << " -" << UnusedLTRegions << "> "
+         << Remat.UseRegion << "] ";
+      Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
+                         /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
+    });
+  };
+#endif
+
+  // Set an objective for the stage based on current RP in each region.
+  REMAT_DEBUG({
+    dbgs() << "Analyzing ";
+    MF.getFunction().printAsOperand(dbgs(), false);
+    dbgs() << ": ";
+  });
+  if (!setObjective()) {
+    LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
+                      << MFI.getMaxWavesPerEU() << '\n');
+    return false;
+  }
+  LLVM_DEBUG({
+    if (TargetOcc) {
+      dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+    } else {
+      dbgs() << "reduce spilling (minimum target occupancy is "
+             << MFI.getMinWavesPerEU() << ")\n";
+    }
+    PrintTargetRegions();
+  });
+
+  if (!collectRematRegs(MIRegion)) {
+    REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
+    return false;
   }

+  const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
+  REMAT_DEBUG({
+    dbgs() << "Rematerializable registers:\n";
+    for (const RematReg &Remat : RematRegs)
+      dbgs() << REMAT_PREFIX << "  " << PrintRematReg(Remat) << '\n';
+    dbgs() << REMAT_PREFIX << "Region frequencies\n";
+    for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
+      dbgs() << REMAT_PREFIX << "  [" << I << "] ";
+      if (Freq)
+        dbgs() << Freq;
+      else
+        dbgs() << "unknown ";
+      dbgs() << " | " << *DAG.Regions[I].first;
+    }
+  });
+
+  SmallVector<ScoredRemat> ScoredRemats;
+  for (const RematReg &Remat : RematRegs)
+    ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);

-  if (!canIncreaseOccupancyOrReduceSpill())
+// Rematerialize registers in successive rounds until all RP targets are
+// satisfied or until we run out of rematerialization candidates.
+#ifndef NDEBUG
+  unsigned RoundNum = 0;
+#endif
+  BitVector RecomputeRP(NumRegions);
+  do {
+    assert(!ScoredRemats.empty() && "no more remat candidates");
+
+    // (Re-)Score and (re-)sort all remats in increasing score order.
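+    // Scores depend on the current RP targets, which earlier rounds may have
+    // partially satisfied, so they are recomputed from scratch before each
+    // round rather than reused.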
+ for (ScoredRemat &Remat : ScoredRemats) + Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc); + sort(ScoredRemats); + + REMAT_DEBUG({ + dbgs() << "==== ROUND " << RoundNum << " ====\n" + << REMAT_PREFIX + << "Candidates with non-null score, in rematerialization order:\n"; + for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) { + if (RematDecision.hasNullScore()) + break; + dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | " + << *RematDecision.Remat->DefMI; + } + PrintTargetRegions(); + }); + + RecomputeRP.reset(); + unsigned RematIdx = ScoredRemats.size(); + + // Rematerialize registers in decreasing score order until we estimate + // that all RP targets are satisfied or until rematerialization candidates + // are no longer useful to decrease RP. + for (; RematIdx && TargetRegions.any(); --RematIdx) { + const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1]; + // Stop rematerializing on encountering a null score. Since scores + // monotonically decrease as we rematerialize, we know there is nothing + // useful left to do in such cases, even if we were to re-score. + if (Candidate.hasNullScore()) { + RematIdx = 0; + break; + } + + const RematReg &Remat = *Candidate.Remat; + // When previous rematerializations in this round have already satisfied + // RP targets in all regions this rematerialization can impact, we have a + // good indication that our scores have diverged significantly from + // reality, in which case we interrupt this round and re-score. This also + // ensures that every rematerialization we perform is possibly impactful + // in at least one target region. + if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) + break; + + REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';); + // Every rematerialization we do here is likely to move the instruction + // into a higher frequency region, increasing the total sum latency of the + // instruction itself. This is acceptable if we are eliminating a spill in + // the process, but when the goal is increasing occupancy we get nothing + // out of rematerialization if occupancy is not increased in the end; in + // such cases we want to roll back the rematerialization. + RollbackInfo *Rollback = + TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr; + rematerialize(Remat, RecomputeRP, Rollback); + unsetSatisifedRPTargets(Remat.Live); + } + +#ifndef NDEBUG + ++RoundNum; +#endif + REMAT_DEBUG({ + if (!TargetRegions.any()) { + dbgs() << "** Interrupt round on all targets achieved\n"; + } else if (RematIdx) { + dbgs() << "** Interrupt round on stale score for " + << *ScoredRemats[RematIdx - 1].Remat->DefMI; + } else { + dbgs() << "** Stop on exhausted rematerialization candidates\n"; + } + }); + + // Peel off registers we already rematerialized from the vector's tail. + ScoredRemats.truncate(RematIdx); + } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) && + !ScoredRemats.empty()); + if (RescheduleRegions.none()) return false; - // Rematerialize identified instructions and update scheduler's state. - rematerialize(); - if (GCNTrackers) - DAG.RegionLiveOuts.buildLiveRegMap(); + // Commit all pressure changes to the DAG and compute minimum achieved + // occupancy in impacted regions. 
+  REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
+  unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
+  for (unsigned I : RescheduleRegions.set_bits()) {
+    DAG.Pressure[I] = RPTargets[I].getCurrentRP();
+    REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
+                       << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
+                       << " (" << RPTargets[I] << ")\n");
+  }
+  AchievedOcc = MFI.getMaxWavesPerEU();
+  for (const GCNRegPressure &RP : DAG.Pressure) {
+    AchievedOcc =
+        std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+  }
+
   REMAT_DEBUG({
     dbgs() << "Retrying function scheduling with new min. occupancy of "
            << AchievedOcc << " from rematerializing (original was "
@@ -1302,7 +1505,6 @@ bool PreRARematStage::initGCNSchedStage() {
       dbgs() << ", target was " << *TargetOcc;
     dbgs() << ")\n";
   });
-
   if (AchievedOcc > DAG.MinOccupancy) {
     DAG.MinOccupancy = AchievedOcc;
     SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1329,6 +1531,10 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
 }

 bool GCNSchedStage::initGCNRegion() {
+  // Skip empty scheduling region.
+  if (DAG.begin() == DAG.end())
+    return false;
+
   // Check whether this new region is also a new block.
   if (DAG.RegionBegin->getParent() != CurrentMBB)
     setupNewBlock();
@@ -1336,8 +1542,8 @@ bool GCNSchedStage::initGCNRegion() {
   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

-  // Skip empty scheduling regions (0 or 1 schedulable instructions).
-  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
+  // Skip regions with 1 schedulable instruction.
+  if (DAG.begin() == std::prev(DAG.end()))
     return false;

   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
@@ -1811,27 +2017,20 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }

-bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
+bool PreRARematStage::setObjective() {
   const Function &F = MF.getFunction();

-  // Maps optimizable regions (i.e., regions at minimum and register-limited
-  // occupancy, or regions with spilling) to the target RP we would like to
-  // reach.
-  DenseMap<unsigned, GCNRPTarget> OptRegions;
+  // Set up "spilling targets" for all regions.
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
   unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
-  auto ResetTargetRegions = [&]() {
-    OptRegions.clear();
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      const GCNRegPressure &RP = DAG.Pressure[I];
-      GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
-      if (!Target.satisfied())
-        OptRegions.insert({I, Target});
-    }
-  };
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    const GCNRegPressure &RP = DAG.Pressure[I];
+    GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
+    if (!Target.satisfied())
+      TargetRegions.set(I);
+  }

-  ResetTargetRegions();
-  if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+  if (TargetRegions.any() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
     // In addition to register usage being above addressable limits, occupancy
     // below the minimum is considered like "spilling" as well.
     TargetOcc = std::nullopt;
@@ -1839,94 +2038,68 @@
     // There is no spilling and room to improve occupancy; set up "increased
     // occupancy targets" for all regions.
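+    // The stage aims exactly one wave above the current minimum occupancy;
+    // the register limits for that occupancy are queried below and applied to
+    // every region's target.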
     TargetOcc = DAG.MinOccupancy + 1;
-    unsigned VGPRBlockSize =
-        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+    const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
     MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
     MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
-    ResetTargetRegions();
-  }
-  REMAT_DEBUG({
-    dbgs() << "Analyzing ";
-    MF.getFunction().printAsOperand(dbgs(), false);
-    dbgs() << ": ";
-    if (OptRegions.empty()) {
-      dbgs() << "no objective to achieve, occupancy is maximal at "
-             << MFI.getMaxWavesPerEU();
-    } else if (!TargetOcc) {
-      dbgs() << "reduce spilling (minimum target occupancy is "
-             << MFI.getMinWavesPerEU() << ')';
-    } else {
-      dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
-             << TargetOcc;
-    }
-    dbgs() << '\n';
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
-        dbgs() << REMAT_PREFIX << "  [" << I << "] " << OptIt->getSecond()
-               << '\n';
-      }
+    for (auto [I, Target] : enumerate(RPTargets)) {
+      Target.setTarget(MaxSGPRs, MaxVGPRs);
+      if (!Target.satisfied())
+        TargetRegions.set(I);
     }
-  });
-  if (OptRegions.empty())
-    return false;
+  }

-  // Accounts for a reduction in RP in an optimizable region. Returns whether we
-  // estimate that we have identified enough rematerialization opportunities to
-  // achieve our goal, and sets Progress to true when this particular reduction
-  // in pressure was helpful toward that goal.
-  auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
-                              bool &Progress) -> bool {
-    GCNRPTarget &Target = OptIt->getSecond();
-    if (!Target.isSaveBeneficial(Reg))
-      return false;
-    Progress = true;
-    Target.saveReg(Reg, Mask, DAG.MRI);
-    if (Target.satisfied())
-      OptRegions.erase(OptIt->getFirst());
-    return OptRegions.empty();
-  };
+  return TargetRegions.any();
+}

+bool PreRARematStage::collectRematRegs(
+    const DenseMap<MachineInstr *, unsigned> &MIRegion) {
   // We need up-to-date live-out info. to query live-out register masks in
   // regions containing rematerializable instructions.
   DAG.RegionLiveOuts.buildLiveRegMap();

-  // Cache set of registers that are going to be rematerialized.
-  DenseSet<Register> RematRegs;
+  // Set of registers already marked for potential rematerialization; used to
+  // avoid rematerialization chains.
+  SmallSet<Register, 4> RematRegSet;
+  auto IsMarkedForRemat = [&RematRegSet](const MachineOperand &MO) -> bool {
+    return MO.isReg() && RematRegSet.contains(MO.getReg());
+  };

   // Identify rematerializable instructions in the function.
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-    auto Region = DAG.Regions[I];
-    for (auto MI = Region.first; MI != Region.second; ++MI) {
+    RegionBoundaries Bounds = DAG.Regions[I];
+    for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
       // The instruction must be rematerializable.
       MachineInstr &DefMI = *MI;
       if (!isReMaterializable(DefMI))
         continue;

-      // We only support rematerializing virtual registers with one definition.
+      // We only support rematerializing virtual registers with one
+      // definition.
       Register Reg = DefMI.getOperand(0).getReg();
       if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
         continue;

       // We only care to rematerialize the instruction if it has a single
-      // non-debug user in a different region. The using MI may not belong to a
-      // region if it is a lone region terminator.
+      // non-debug user in a different region.
+      // FIXME: Allow rematerializations with multiple uses. This should be
+      // relatively easy to support using the current cost model.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg); if (!UseMI) continue; auto UseRegion = MIRegion.find(UseMI); - if (UseRegion != MIRegion.end() && UseRegion->second == I) + if (UseRegion == MIRegion.end() || UseRegion->second == I) continue; // Do not rematerialize an instruction if it uses or is used by an // instruction that we have designated for rematerialization. // FIXME: Allow for rematerialization chains: this requires 1. updating - // remat points to account for uses that are rematerialized, and 2. either - // rematerializing the candidates in careful ordering, or deferring the - // MBB RP walk until the entire chain has been rematerialized. - if (Rematerializations.contains(UseMI) || - llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) { - return MO.isReg() && RematRegs.contains(MO.getReg()); - })) + // remat points to account for uses that are rematerialized, and 2. + // either rematerializing the candidates in careful ordering, or + // deferring the MBB RP walk until the entire chain has been + // rematerialized. + const MachineOperand &UseMO = UseMI->getOperand(0); + if (IsMarkedForRemat(UseMO) || + llvm::any_of(DefMI.operands(), IsMarkedForRemat)) continue; // Do not rematerialize an instruction it it uses registers that aren't @@ -1937,106 +2110,177 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { *DAG.TII)) continue; - REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); - RematInstruction &Remat = - Rematerializations.try_emplace(&DefMI, UseMI).first->second; - - bool RematUseful = false; - if (auto It = OptRegions.find(I); It != OptRegions.end()) { - // Optimistically consider that moving the instruction out of its - // defining region will reduce RP in the latter; this assumes that - // maximum RP in the region is reached somewhere between the defining - // instruction and the end of the region. - REMAT_DEBUG(dbgs() << " Defining region is optimizable\n"); - LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg]; - if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) - return true; - } - - for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { - // We are only collecting regions in which the register is a live-in - // (and may be live-through). - auto It = DAG.LiveIns[LIRegion].find(Reg); - if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) - continue; - Remat.LiveInRegions.insert(LIRegion); - - // Account for the reduction in RP due to the rematerialization in an - // optimizable region in which the defined register is a live-in. This - // is exact for live-through region but optimistic in the using region, - // where RP is actually reduced only if maximum RP is reached somewhere - // between the beginning of the region and the rematerializable - // instruction's use. - if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) { - REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); - if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], - RematUseful)) - return true; - } - } - - // If the instruction is not a live-in or live-out in any optimizable - // region then there is no point in rematerializing it. - if (!RematUseful) { - Rematerializations.pop_back(); - REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n"); - } else { - RematRegs.insert(Reg); - } + // Add the instruction to the rematerializable list. 
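+      // Recording the register in RematRegSet lets subsequent candidates that
+      // use it, or that it uses, be rejected above, ruling out remat chains.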
+      RematRegSet.insert(Reg);
+      RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
     }
   }

-  if (TargetOcc) {
-    // We were trying to increase occupancy but failed, abort the stage.
-    REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
-    Rematerializations.clear();
-    return false;
+  return !RematRegs.empty();
+}
+
+PreRARematStage::RematReg::RematReg(
+    MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
+    const DenseMap<MachineInstr *, unsigned> &MIRegion)
+    : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
+      LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
+      DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
+
+  // Mark regions in which the rematerializable register is live.
+  Register Reg = getReg();
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto LiveInIt = DAG.LiveIns[I].find(Reg);
+    if (LiveInIt != DAG.LiveIns[I].end() && LiveInIt->second.any())
+      LiveIn.set(I);
+    auto LiveOutIt = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).find(Reg);
+    auto LiveOutEnd = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).end();
+    if (LiveOutIt != LiveOutEnd && LiveOutIt->second.any())
+      LiveOut.set(I);
+  }
+  Live |= LiveIn;
+  Live |= LiveOut;
+
+  // Store the register's lane bitmask.
+  unsigned SubIdx = DefMI->getOperand(0).getSubReg();
+  Mask = SubIdx ? DAG.TRI->getSubRegIndexLaneMask(SubIdx)
+                : DAG.MRI.getMaxLaneMaskForVReg(Reg);
+}
+
+bool PreRARematStage::RematReg::maybeBeneficial(
+    const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
+  Register Reg = getReg();
+  for (unsigned I : TargetRegions.set_bits()) {
+    if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
+      return true;
   }
-  REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
-  return !Rematerializations.empty();
+  return false;
 }

-void PreRARematStage::rematerialize() {
-  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
+                                         MachineInstr *RematMI,
+                                         GCNScheduleDAGMILive &DAG) const {
+  RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
+  if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
+    Bounds.first = RematMI;
+  DAG.LIS->InsertMachineInstrInMaps(*RematMI);
+  DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
+}

-  // Collect regions whose RP changes in unpredictable way; we will have to
-  // fully recompute their RP after all rematerailizations.
-  DenseSet<unsigned> RecomputeRP;
-
-  // Rematerialize all instructions.
-  for (auto &[DefMI, Remat] : Rematerializations) {
-    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
-    Register Reg = DefMI->getOperand(0).getReg();
-    unsigned DefRegion = MIRegion.at(DefMI);
-
-    // Rematerialize DefMI to its use block.
-    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                       AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
-    Remat.RematMI = &*std::prev(InsertPos);
-    DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
-
-    // Update region boundaries in regions we sinked from (remove defining MI)
-    // and to (insert MI rematerialized in use block). Only then we can erase
-    // the original MI.
-    DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
-    auto UseRegion = MIRegion.find(Remat.UseMI);
-    if (UseRegion != MIRegion.end()) {
-      DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
-                                 Remat.RematMI);
-    }
-    DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
-    DefMI->eraseFromParent();
+PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
+    MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
+  assert(DAG.MLI && "MLI not defined in DAG");
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
+
+  const unsigned NumRegions = DAG.Regions.size();
+  uint64_t MinFreq = MBFI.getEntryFreq().getFrequency();
+  Regions.reserve(NumRegions);
+  for (unsigned I = 0; I < NumRegions; ++I) {
+    MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
+    uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
+    Regions.push_back(BlockFreq);
+    if (BlockFreq && BlockFreq < MinFreq)
+      MinFreq = BlockFreq;
+    else if (BlockFreq > MaxFreq)
+      MaxFreq = BlockFreq;
+  }
+  if (!MinFreq)
+    return;
+
+  // Normalize to minimum observed frequency to avoid underflows/overflows when
+  // combining frequencies.
+  for (uint64_t &Freq : Regions)
+    Freq /= MinFreq;
+  MaxFreq /= MinFreq;
+}
+
+PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
+                                          const FreqInfo &Freq,
+                                          const GCNScheduleDAGMILive &DAG)
+    : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
+
+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+    const GCNScheduleDAGMILive &DAG) const {
+  const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg());
+  unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
+  if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) {
+    // The following may return -1 (i.e., a large unsigned number) on indices
+    // that may be used to access subregisters of multiple sizes; in such cases
+    // fallback on the size derived from the register class.
+    unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx);
+    if (SubRegSize < RegSize)
+      RegSize = SubRegSize;
+  }
+  return divideCeil(RegSize, 32);
+}
+
+int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
+  // Get frequencies of defining and using regions. A rematerialization from the
+  // least frequent region to the most frequent region will yield the greatest
+  // latency penalty and therefore should get minimum score. Conversely, a
+  // rematerialization in the other direction should get maximum score. Default
+  // to values that will yield the worst possible score given known frequencies
+  // in order to penalize rematerializations from or into regions whose
+  // frequency is unknown.
+  uint64_t DefOrOne = std::max(Freq.Regions[Remat->DefRegion], (uint64_t)1);
+  uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
+  if (!UseOrMax)
+    UseOrMax = Freq.MaxFreq;
+  return DefOrOne - UseOrMax;
+}
+
+void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
+                                          ArrayRef<GCNRPTarget> RPTargets,
+                                          const FreqInfo &FreqInfo,
+                                          bool ReduceSpill) {
+  MaxFreq = 0;
+  RegionImpact = 0;
+  for (unsigned I : TargetRegions.set_bits()) {
+    if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg()))
+      continue;
+    bool UnusedLT = Remat->isUnusedLiveThrough(I);
+
+    // Regions in which RP is guaranteed to decrease have more weight.
+    RegionImpact += UnusedLT ? 2 : 1;

-    // Collect all regions impacted by the rematerialization and update their
-    // live-in/RP information.
-    for (unsigned I : Remat.LiveInRegions) {
-      ImpactedRegions.insert({I, DAG.Pressure[I]});
-      GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+    if (ReduceSpill) {
+      uint64_t Freq = FreqInfo.Regions[I];
+      if (!UnusedLT) {
+        // Apply a frequency penalty in regions in which we are not sure that RP
+        // will decrease.
+        Freq /= 2;
+      }
+      MaxFreq = std::max(MaxFreq, Freq);
+    }
+  }
+}

+void PreRARematStage::rematerialize(const RematReg &Remat,
+                                    BitVector &RecomputeRP,
+                                    RollbackInfo *Rollback) {
+  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+  MachineInstr &DefMI = *Remat.DefMI;
+  Register Reg = DefMI.getOperand(0).getReg();
+  Register NewReg = DAG.MRI.cloneVirtualRegister(Reg);
+
+  // Rematerialize the register in the region where it is used.
+  MachineBasicBlock::iterator InsertPos = Remat.UseMI;
+  TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI,
+                     *DAG.TRI);
+  MachineInstr *RematMI = &*std::prev(InsertPos);
+  Remat.UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
+  Remat.insertMI(Remat.UseRegion, RematMI, DAG);
+  if (Rollback)
+    Rollback->RematMI = RematMI;
+
+  // Remove the register from all regions where it is a live-in or live-out
+  // and adjust RP targets.
+  for (unsigned I : Remat.Live.set_bits()) {
 #ifdef EXPENSIVE_CHECKS
-    // All uses are known to be available / live at the remat point. Thus, the
-    // uses should already be live in to the region.
-    for (MachineOperand &MO : DefMI->operands()) {
+    if (!Remat.LiveIn[I] && Remat.LiveOut[I]) {
+      // All uses are known to be available / live at the remat point. Thus,
+      // the uses should already be live in to the region.
+      for (MachineOperand &MO : DefMI.operands()) {
         if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
           continue;

@@ -2049,7 +2293,7 @@ void PreRARematStage::rematerialize() {
         if (LI.hasSubRanges() && MO.getSubReg())
           LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());

-        LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
+        LaneBitmask LiveInMask = DAG.LiveIns[I].at(UseReg);
         LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
         // If this register has lanes not covered by the LiveIns, be sure they
         // do not map to any subrange. ref:
@@ -2060,65 +2304,78 @@ void PreRARematStage::rematerialize() {
           assert((SR.LaneMask & UncoveredLanes).none());
       }
     }
+    }
 #endif

-      // The register is no longer a live-in in all regions but the one that
-      // contains the single use. In live-through regions, maximum register
-      // pressure decreases predictably so we can directly update it. In the
-      // using region, maximum RP may or may not decrease, so we will mark it
-      // for re-computation after all materializations have taken place.
-      LaneBitmask PrevMask = RegionLiveIns[Reg];
-      RegionLiveIns.erase(Reg);
-      RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
-      if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
-        DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
-      else
-        RecomputeRP.insert(I);
+    // This save is guaranteed in regions in which the register is live-through
+    // and unused but optimistic in all other regions where the register is
+    // live.
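+    // Optimistic regions are flagged in RecomputeRP so that their RP can be
+    // fully recomputed and re-checked at the end of the round.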
+    RPTargets[I].saveReg(Reg, Remat.Mask, DAG.MRI);
+    DAG.LiveIns[I].erase(Reg);
+    DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
+    if (!Remat.isUnusedLiveThrough(I))
+      RecomputeRP.set(I);
+  }
+
+  DAG.deleteMI(Remat.DefRegion, &DefMI);
+  RescheduleRegions |= Remat.Live;
+}
+
+void PreRARematStage::rollback(const RollbackInfo &Rollback,
+                               BitVector &RecomputeRP) const {
+  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+  auto &[Remat, RematMI] = Rollback;
+  MachineBasicBlock *MBB = RegionBB[Remat->DefRegion];
+  Register Reg = RematMI->getOperand(0).getReg();
+  Register NewReg = DAG.MRI.cloneVirtualRegister(Reg);
+
+  // Re-rematerialize MI in its original region. Note that it may not be
+  // rematerialized exactly in the same position as originally within the
+  // region, but it should not matter much.
+  MachineBasicBlock::iterator InsertPos(DAG.Regions[Remat->DefRegion].second);
+  TII->reMaterialize(*MBB, InsertPos, NewReg, 0, *RematMI, *DAG.TRI);
+  MachineInstr *ReRematMI = &*std::prev(InsertPos);
+  REMAT_DEBUG(dbgs() << '[' << Remat->DefRegion << "] Re-rematerialized as "
+                     << *ReRematMI);
+  Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
+  DAG.deleteMI(Remat->UseRegion, RematMI);
+  Remat->insertMI(Remat->DefRegion, ReRematMI, DAG);
+
+  // Re-add the register as a live-in/live-out in all regions it used to be
+  // one in.
+  std::pair<Register, LaneBitmask> LiveReg(NewReg, Remat->Mask);
+  for (unsigned I : Remat->LiveIn.set_bits())
+    DAG.LiveIns[I].insert(LiveReg);
+  for (unsigned I : Remat->LiveOut.set_bits())
+    DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
+  RecomputeRP |= Remat->Live;
+}
+
+void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
+  for (unsigned I : Regions.set_bits()) {
+    if (TargetRegions[I] && RPTargets[I].satisfied()) {
+      REMAT_DEBUG(dbgs() << "  [" << I << "] Target reached!\n");
+      TargetRegions.reset(I);
     }
-    // RP in the region from which the instruction was rematerialized may or may
-    // not decrease.
-    ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
-    RecomputeRP.insert(DefRegion);
-
-    // Recompute live interval to reflect the register's rematerialization.
-    Register RematReg = Remat.RematMI->getOperand(0).getReg();
-    DAG.LIS->removeInterval(RematReg);
-    DAG.LIS->createAndComputeVirtRegInterval(RematReg);
-  }
-
-  // All regions impacted by at least one rematerialization must be rescheduled.
-  // Maximum pressure must also be recomputed for all regions where it changed
-  // non-predictably and checked against the target occupancy.
-  unsigned DynamicVGPRBlockSize =
-      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-  AchievedOcc = MFI.getMaxWavesPerEU();
-  for (auto &[I, OriginalRP] : ImpactedRegions) {
-    bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
-    RescheduleRegions[I] = !IsEmptyRegion;
-    if (!RecomputeRP.contains(I))
-      continue;
+  }
 }

-    GCNRegPressure RP;
-    if (IsEmptyRegion) {
-      RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
-    } else {
-      GCNDownwardRPTracker RPT(*DAG.LIS);
-      auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
-                                                      DAG.Regions[I].second);
-      if (NonDbgMI == DAG.Regions[I].second) {
-        // Region is non-empty but contains only debug instructions.
-        RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
-      } else {
-        RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
-        RPT.advance(DAG.Regions[I].second);
-        RP = RPT.moveMaxPressure();
-      }
+bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
+  bool TooOptimistic = false;
+  for (unsigned I : Regions.set_bits()) {
+    GCNRPTarget &Target = RPTargets[I];
+    Target.setRP(DAG.getRealRegPressure(I));
+
+    // Since we were optimistic in assessing RP decreases in these regions, we
+    // may need to re-mark the target as a target region if RP didn't decrease
+    // as expected.
+    if (!TargetRegions[I] && !Target.satisfied()) {
+      REMAT_DEBUG(dbgs() << "  [" << I << "] Incorrect RP estimation\n");
+      TooOptimistic = true;
+      TargetRegions.set(I);
     }
-    DAG.Pressure[I] = RP;
-    AchievedOcc =
-        std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
   }
-  REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
+  return TooOptimistic;
 }

 // Copied from MachineLICM
@@ -2142,79 +2399,33 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {

 void PreRARematStage::finalizeGCNSchedStage() {
   // We consider that reducing spilling is always beneficial so we never
   // rollback rematerializations in such cases. It's also possible that
-  // rescheduling lowers occupancy over the one achieved just through remats, in
-  // which case we do not want to rollback either (the rescheduling was already
-  // reverted in PreRARematStage::shouldRevertScheduling in such cases).
+  // rescheduling lowers occupancy over the one achieved just through remats,
+  // in which case we do not want to rollback either (the rescheduling was
+  // already reverted in PreRARematStage::shouldRevertScheduling in such
+  // cases).
   unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
   if (!TargetOcc || MaxOcc >= *TargetOcc)
     return;

-  REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
-  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
-
-  // Rollback the rematerializations.
-  for (const auto &[DefMI, Remat] : Rematerializations) {
-    MachineInstr &RematMI = *Remat.RematMI;
-    unsigned DefRegion = MIRegion.at(DefMI);
-    MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
-    MachineBasicBlock *MBB = RegionBB[DefRegion];
-    Register Reg = RematMI.getOperand(0).getReg();
-
-    // Re-rematerialize MI at the end of its original region. Note that it may
-    // not be rematerialized exactly in the same position as originally within
-    // the region, but it should not matter much.
-    TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
-                       *DAG.TRI);
-    MachineInstr *NewMI = &*std::prev(InsertPos);
-    DAG.LIS->InsertMachineInstrInMaps(*NewMI);
-
-    auto UseRegion = MIRegion.find(Remat.UseMI);
-    if (UseRegion != MIRegion.end()) {
-      DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
-                                 nullptr);
-    }
-    DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
-
-    // Erase rematerialized MI.
-    DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
-    RematMI.eraseFromParent();
-
-    // Recompute live interval for the re-rematerialized register
-    DAG.LIS->removeInterval(Reg);
-    DAG.LIS->createAndComputeVirtRegInterval(Reg);
-
-    // Re-add the register as a live-in in all regions it used to be one in.
-    for (unsigned LIRegion : Remat.LiveInRegions)
-      DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
-  }
-
-  // Reset RP in all impacted regions.
-  for (auto &[I, OriginalRP] : ImpactedRegions)
-    DAG.Pressure[I] = OriginalRP;
+  // Rollback, then recompute pressure in all affected regions.
+  REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
+  BitVector RecomputeRP(DAG.Regions.size());
+  for (const RollbackInfo &Rollback : Rollbacks)
+    rollback(Rollback, RecomputeRP);
+  for (unsigned I : RecomputeRP.set_bits())
+    DAG.Pressure[I] = DAG.getRealRegPressure(I);

   GCNSchedStage::finalizeGCNSchedStage();
 }

-void GCNScheduleDAGMILive::updateRegionBoundaries(
-    RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
-    MachineInstr *NewMI) {
-  assert((!NewMI || NewMI != RegionBounds.second) &&
-         "cannot remove at region end");
-
-  if (RegionBounds.first == RegionBounds.second) {
-    assert(NewMI && "cannot remove from an empty region");
-    RegionBounds.first = NewMI;
-    return;
-  }
-
-  // We only care for modifications at the beginning of a non-empty region since
-  // the upper region boundary is exclusive.
-  if (MI != RegionBounds.first)
-    return;
-  if (!NewMI)
-    RegionBounds.first = std::next(MI); // Removal
-  else
-    RegionBounds.first = NewMI; // Insertion
+void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
+  // It's not possible for the deleted instruction to be the upper region
+  // boundary since we don't delete region terminators.
+  if (Regions[RegionIdx].first == MI)
+    Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
+  LIS->removeInterval(MI->getOperand(0).getReg());
+  LIS->RemoveMachineInstrFromMaps(*MI);
+  MI->eraseFromParent();
 }

 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f357981ac91de..909de698a44a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -18,6 +18,8 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include <cstdint>
+#include <optional>

 namespace llvm {

@@ -300,18 +302,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Compute and cache live-ins and pressure for all regions in block.
   void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);

-  /// If necessary, updates a region's boundaries following insertion ( \p NewMI
-  /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
-  /// For an MI removal, this must be called before the MI is actually erased
-  /// from its parent MBB.
-  void updateRegionBoundaries(RegionBoundaries &RegionBounds,
-                              MachineBasicBlock::iterator MI,
-                              MachineInstr *NewMI);
-
   void runSchedStages();

   std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);

+  void deleteMI(unsigned RegionIdx, MachineInstr *MI);
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);

@@ -447,65 +443,215 @@ class ClusteredLowOccStage : public GCNSchedStage {
 };

 /// Attempts to reduce function spilling or, if there is no spilling, to
-/// increase function occupancy by one with respect to ArchVGPR usage by sinking
-/// rematerializable instructions to their use. When the stage
-/// estimates reducing spilling or increasing occupancy is possible, as few
-/// instructions as possible are rematerialized to reduce potential negative
+/// increase function occupancy by one with respect to register usage by sinking
+/// rematerializable instructions to their use. When the stage estimates that
+/// reducing spilling or increasing occupancy is possible, it tries to
+/// rematerialize as few registers as possible to reduce potential negative
 /// effects on function latency.
+///
+/// The stage only supports rematerializing registers that meet all of the
+/// following constraints.
+/// 1. The register is virtual and has a single defining instruction.
+/// 2. The single defining instruction is either deemed rematerializable by the
+///    target-independent logic, or if not, has no non-constant and
+///    non-ignorable physical register use.
+/// 3. The register has no virtual register use whose live range would be
+///    extended by the rematerialization.
+/// 4. The register has a single non-debug user in a different region from its
+///    defining region.
+/// 5. The register does not use and is not used by another register that is
+///    going to be rematerialized.
 class PreRARematStage : public GCNSchedStage {
 private:
-  /// Useful information about a rematerializable instruction.
-  struct RematInstruction {
-    /// Single use of the rematerializable instruction's defined register,
-    /// located in a different block.
+  /// A rematerializable register.
+  struct RematReg {
+    /// Single MI defining the rematerializable register.
+    MachineInstr *DefMI;
+    /// Single user of the rematerializable register.
     MachineInstr *UseMI;
-    /// Rematerialized version of \p DefMI, set in
-    /// PreRARematStage::rematerialize. Used for reverting rematerializations.
+    /// Regions in which the register is live-in/live-out/live anywhere.
+    BitVector LiveIn, LiveOut, Live;
+    /// The rematerializable register's lane bitmask.
+    LaneBitmask Mask;
+    /// Defining and using regions.
+    unsigned DefRegion, UseRegion;
+
+    RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
+             GCNScheduleDAGMILive &DAG,
+             const DenseMap<MachineInstr *, unsigned> &MIRegion);
+
+    /// Returns the rematerializable register. Do not call after deleting the
+    /// original defining instruction.
+    Register getReg() const { return DefMI->getOperand(0).getReg(); }
+
+    /// Determines whether this rematerialization may be beneficial in at least
+    /// one target region.
+    bool maybeBeneficial(const BitVector &TargetRegions,
+                         ArrayRef<GCNRPTarget> RPTargets) const;
+
+    /// Determines if the register is both unused and live-through in region \p
+    /// I. This guarantees that rematerializing it will reduce RP in the region.
+    bool isUnusedLiveThrough(unsigned I) const {
+      assert(I < Live.size() && "region index out of range");
+      return LiveIn[I] && LiveOut[I] && I != UseRegion;
+    }
+
+    /// Updates internal structures following an MI rematerialization. Part of
+    /// the stage instead of the DAG because it makes assumptions that are
+    /// specific to the rematerialization process.
+    void insertMI(unsigned RegionIdx, MachineInstr *RematMI,
+                  GCNScheduleDAGMILive &DAG) const;
+  };
+
+  /// A scored rematerialization candidate. Higher scores indicate more
+  /// beneficial rematerializations. A null score indicates the rematerialization
+  /// is not helpful to reduce RP in target regions.
+  struct ScoredRemat {
+    /// The rematerializable register under consideration.
+    const RematReg *Remat;
+
+    /// Execution frequency information required by scoring heuristics.
+    struct FreqInfo {
+      /// Per-region execution frequencies, normalized to minimum observed
+      /// frequency. 0 when unknown.
+      SmallVector<uint64_t> Regions;
+      /// Maximum observed frequency, normalized to minimum observed frequency.
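+      /// For example (purely illustrative values), raw block frequencies of
+      /// {8, 2, 4} normalize to {4, 1, 2}, giving a MaxFreq of 4.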
+      uint64_t MaxFreq = 0;
+
+      FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
+    };
+
+    /// This only initializes state-independent characteristics of \p Remat, not
+    /// the actual score.
+    ScoredRemat(const RematReg *Remat, const FreqInfo &Freq,
+                const GCNScheduleDAGMILive &DAG);
+
+    /// Updates the rematerialization's score w.r.t. the current \p RPTargets.
+    /// \p Freq indicates the execution frequency of each region.
+    void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
+                const FreqInfo &Freq, bool ReduceSpill);
+
+    /// Returns whether the current score is null, indicating the
+    /// rematerialization is useless.
+    bool hasNullScore() const { return !MaxFreq && !RegionImpact; }
+
+    /// For each pair of candidates, the most important scoring component with
+    /// non-equal values determines the result of the comparison (higher is
+    /// better).
+    bool operator<(const ScoredRemat &O) const {
+      if (hasNullScore() != O.hasNullScore())
+        return hasNullScore();
+      if (MaxFreq != O.MaxFreq)
+        return MaxFreq < O.MaxFreq;
+      if (FreqDiff != O.FreqDiff)
+        return FreqDiff < O.FreqDiff;
+      if (RegionImpact != O.RegionImpact)
+        return RegionImpact < O.RegionImpact;
+      // Break ties using pointer to rematerializable register.
+      return Remat > O.Remat;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    Printable print() const;
+#endif
+
+  private:
+    /// Number of 32-bit registers this rematerialization covers.
+    const unsigned NumRegs;
+
+    // The three members below are the scoring components, top to bottom from
+    // most important to least important when comparing candidates.
+
+    /// Frequency of impacted target region with highest known frequency. This
+    /// only matters when the stage is trying to reduce spilling, so it is
+    /// always 0 when it is not.
+    uint64_t MaxFreq;
+    /// Frequency difference between defining and using regions. Negative values
+    /// indicate we are rematerializing to higher frequency regions; positive
+    /// values indicate the contrary.
+    const int64_t FreqDiff;
+    /// Expected number of target regions impacted by the rematerialization,
+    /// scaled by the size of the register being rematerialized.
+    unsigned RegionImpact;
+
+    unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
+
+    int64_t getFreqDiff(const FreqInfo &Freq) const;
+  };
+
+  /// Holds enough information to roll back a rematerialization decision post
+  /// re-scheduling.
+  struct RollbackInfo {
+    /// The rematerializable register under consideration.
+    const RematReg *Remat;
+    /// The rematerialized MI replacing the original defining MI.
     MachineInstr *RematMI;
-    /// Set of regions in which the rematerializable instruction's defined
-    /// register is a live-in.
-    SmallDenseSet<unsigned, 4> LiveInRegions;

-    RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
+    RollbackInfo(const RematReg *Remat) : Remat(Remat) {}
   };

-  /// Maps all MIs to their parent region. MI terminators are considered to be
-  /// outside the region they delimitate, and as such are not stored in the map.
-  DenseMap<MachineInstr *, unsigned> MIRegion;
   /// Parent MBB to each region, in region order.
   SmallVector<MachineBasicBlock *> RegionBB;
-  /// Collects instructions to rematerialize.
-  MapVector<MachineInstr *, RematInstruction> Rematerializations;
-  /// Collects regions whose live-ins or register pressure will change due to
-  /// rematerializations.
-  DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
-  /// In case we need to rollback rematerializations, save lane masks for all
-  /// rematerialized registers in all regions in which they are live-ins.
-  DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
-  /// After successful stage initialization, indicates which regions should be
-  /// rescheduled.
-  BitVector RescheduleRegions;
-  /// The target occupancy the stage is trying to achieve. Empty when the
+
+  /// Register pressure targets for all regions.
+  SmallVector<GCNRPTarget> RPTargets;
+  /// Regions which are above the stage's RP target.
+  BitVector TargetRegions;
+  /// The target occupancy the stage is trying to achieve. Empty when the
   /// objective is spilling reduction.
   std::optional<unsigned> TargetOcc;
   /// Achieved occupancy *only* through rematerializations (pre-rescheduling).
-  /// Smaller than or equal to the target occupancy.
+  /// Smaller than or equal to the target occupancy, when it is defined.
   unsigned AchievedOcc;

-  /// Returns whether remat can reduce spilling or increase function occupancy
-  /// by 1 through rematerialization. If it can do one, collects instructions in
-  /// PreRARematStage::Rematerializations and sets the target occupancy in
-  /// PreRARematStage::TargetOccupancy.
-  bool canIncreaseOccupancyOrReduceSpill();
+  /// List of rematerializable registers.
+  SmallVector<RematReg> RematRegs;
+  /// List of rematerializations to roll back if rematerialization does not end
+  /// up being beneficial.
+  SmallVector<RollbackInfo> Rollbacks;
+  /// After successful stage initialization, indicates which regions should be
+  /// rescheduled.
+  BitVector RescheduleRegions;
+
+  /// Determines the stage's objective (increasing occupancy or reducing
+  /// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
+  /// achieve that objective and marks those that don't achieve it in \ref
+  /// TargetRegions. Returns whether there is any target region.
+  bool setObjective();
+
+  /// Unsets target regions in \p Regions whose RP target has been reached.
+  void unsetSatisifedRPTargets(const BitVector &Regions);
+
+  /// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets
+  /// again all \ref TargetRegions that were optimistically marked as satisfied
+  /// but are actually not, and returns whether there were any such regions.
+  bool updateAndVerifyRPTargets(const BitVector &Regions);
+
+  /// Collects all rematerializable registers and appends them to \ref
+  /// RematRegs. \p MIRegion maps MIs to their region. Returns whether any
+  /// rematerializable register was found.
+  bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion);
+
+  /// Rematerializes \p Remat. This removes the rematerialized register from
+  /// live-in/out lists in the DAG and updates RP targets in all affected
+  /// regions, which are also marked in \ref RescheduleRegions. Regions in which
+  /// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback
+  /// is non-null, fills it with required information to be able to roll back
+  /// the rematerialization post-rescheduling.
+  void rematerialize(const RematReg &Remat, BitVector &RecomputeRP,
+                     RollbackInfo *Rollback);
+
+  /// Rolls back the rematerialization decision represented by \p Rollback. This
+  /// updates live-in/out lists in the DAG but does not update cached register
+  /// pressures. Regions in which RP may be impacted are marked in \p
+  /// RecomputeRP.
+  void rollback(const RollbackInfo &Rollback, BitVector &RecomputeRP) const;

   /// Whether the MI is rematerializable
   bool isReMaterializable(const MachineInstr &MI);

-  /// Rematerializes all instructions in PreRARematStage::Rematerializations
-  /// and stores the achieved occupancy after remat in
-  /// PreRARematStage::AchievedOcc.
- void rematerialize(); - /// If remat alone did not increase occupancy to the target one, rollbacks all /// rematerializations and resets live-ins/RP in all regions impacted by the /// stage to their pre-stage values. @@ -519,7 +665,12 @@ class PreRARematStage : public GCNSchedStage { bool shouldRevertScheduling(unsigned WavesAfter) override; PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) - : GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {} + : GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()), + RescheduleRegions(DAG.Regions.size()) { + const unsigned NumRegions = DAG.Regions.size(); + RPTargets.reserve(NumRegions); + RegionBB.reserve(NumRegions); + } }; class ILPInitialScheduleStage : public GCNSchedStage { diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..72e827494e045 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -419,7 +419,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 ; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 ; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX942-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 @@ -427,10 +427,10 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_mov_b32 s4, s13 ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 -; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, 0x2000 ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v0 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 @@ -447,9 +447,9 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen @@ -945,7 +945,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 ; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 ; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX942-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX942-NEXT: 
s_mov_b32 s9, s2 @@ -953,10 +953,10 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_mov_b32 s4, s13 ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 -; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, 0x100 ; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v0 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 @@ -973,9 +973,9 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-rematerialization-scoring.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-rematerialization-scoring.mir new file mode 100644 index 0000000000000..0bfcb638038fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-rematerialization-scoring.mir @@ -0,0 +1,523 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck %s + +# All tests are almost identical, the only differences being that some +# VGPR-defining instructions are progressively made artificially +# unrematerializable with an implicit def to test rematerialization +# priorities. The CFG is the following for all tests in the file. +# +# +---+ +# | 0 | +# +---+ +# | +# v +# +---+ +# +------>| 1 |-----+ +# | +---+ | +# | | v +# | | +---+ +# | | | 2 | +# | | +-+-+ +# | v | +# +---+ +---+ | +# | 4 |<----| 3 |<----+ +# +---+ +---+ +# | +# v +# +---+ +# | 5 | +# +---+ + +# %32's defining and using region frequencies are identical therefore it is the +# best register to rematerialize. 
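+# Rematerializing between equally frequent regions adds no expected latency
+# (the frequency difference scoring component is zero), unlike the other
+# candidates, whose users sit in the more frequent loop body.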
+name: favor_same_frequency +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; CHECK-LABEL: name: favor_same_frequency + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: %loop_if_bound:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + ; CHECK-NEXT: %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, %loop_if_bound, implicit $exec + ; CHECK-NEXT: %loop_counter:sreg_32 = COPY %mem_data.sub1 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; CHECK-NEXT: 
[[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK-NEXT: %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + ; CHECK-NEXT: $exec = S_MOV_B64_term %exec_if + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc + ; CHECK-NEXT: %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc + ; CHECK-NEXT: S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, 
implicit $exec, implicit $mode + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]] + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1 + %loop_if_bound:vgpr_32 = COPY $vgpr0 + %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, killed %loop_if_bound, implicit $exec + %loop_counter:sreg_32 = COPY %mem_data.sub1 + + %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode + %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode + %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode + %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode + %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode + %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode + %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode + %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode + %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode + %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode + %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode + %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode + %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode + %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode + %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode + %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode + %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode + %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode + %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode + %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode + + bb.1: + successors: %bb.2, %bb.3 + + %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + $exec = S_MOV_B64_term %exec_if + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + + S_NOP 0, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, 
implicit %31
+
+  bb.3:
+    successors: %bb.4(0x7c000000), %bb.5(0x04000000)
+
+    $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc
+    %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc
+    S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.5, implicit killed $scc
+
+  bb.4:
+    successors: %bb.1
+
+    S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+    S_NOP 0, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15
+    S_NOP 0, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23
+
+    S_BRANCH %bb.1
+
+  bb.5:
+
+    S_NOP 0, implicit %32
+
+    S_ENDPGM 0
+...
+---
+# bb.2's frequency is lower than bb.4's, so it is preferable to
+# rematerialize registers in bb.2 instead of bb.4.
+name: favor_lower_frequency
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; CHECK-LABEL: name: favor_lower_frequency
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT: %loop_if_bound:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT: %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode
+  ; CHECK-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode
+  ; CHECK-NEXT:
[[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, %loop_if_bound, implicit $exec + ; CHECK-NEXT: %loop_counter:sreg_32 = COPY %mem_data.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK-NEXT: %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + ; CHECK-NEXT: $exec = S_MOV_B64_term %exec_if + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc + ; CHECK-NEXT: %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc + ; CHECK-NEXT: S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode + ; CHECK-NEXT: S_NOP 0, 
implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1 + %loop_if_bound:vgpr_32 = COPY $vgpr0 + %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, killed %loop_if_bound, implicit $exec + %loop_counter:sreg_32 = COPY %mem_data.sub1 + + %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode + %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode + %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode + %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode + %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode + %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode + %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode + %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode + %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode + %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode + %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode + %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode + %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode + %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode + %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode + %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode + %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit 
$exec, implicit $mode + %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode + %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode + %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode + %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 + + bb.1: + successors: %bb.2, %bb.3 + + %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + $exec = S_MOV_B64_term %exec_if + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + + S_NOP 0, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31 + + bb.3: + successors: %bb.4(0x7c000000), %bb.5(0x04000000) + + $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc + %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc + S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.5, implicit killed $scc + + bb.4: + successors: %bb.1 + + S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 + S_NOP 0, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15 + S_NOP 0, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23 + + S_BRANCH %bb.1 + + bb.5: + + S_NOP 0, implicit %32 + + S_ENDPGM 0 +... +--- +# Rematerializing registers used in bb.4 is the only option. +name: remat_in_only_possible_region +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; CHECK-LABEL: name: remat_in_only_possible_region + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: %loop_if_bound:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + ; 
CHECK-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 + ; CHECK-NEXT: %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, %loop_if_bound, implicit $exec + ; CHECK-NEXT: %loop_counter:sreg_32 = COPY %mem_data.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK-NEXT: %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + ; CHECK-NEXT: $exec = S_MOV_B64_term %exec_if + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc + ; CHECK-NEXT: %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc + ; CHECK-NEXT: S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit 
$mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]] + ; CHECK-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %mem_addr:sgpr_64 = COPY $sgpr0_sgpr1 + %loop_if_bound:vgpr_32 = COPY $vgpr0 + %mem_data:sreg_64_xexec = S_LOAD_DWORDX2_IMM %mem_addr, 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %exec_loop_mask:sreg_64 = V_CMP_GT_U32_e64 %mem_data.sub0, killed %loop_if_bound, implicit $exec + %loop_counter:sreg_32 = COPY %mem_data.sub1 + + %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode + %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode + %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode + %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode + %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode + %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode + %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode + %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode + %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode + %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode + %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode + %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode + %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode + %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode + %14:vgpr_32 
= nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode + %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode + %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode + %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode + %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode + %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode + %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode + %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 + %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 + %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0 + %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0 + %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0 + %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 + + bb.1: + successors: %bb.2, %bb.3 + + %exec_save_if:sreg_64 = COPY $exec, implicit-def $exec + %exec_if:sreg_64 = S_AND_B64 %exec_save_if, %exec_loop_mask, implicit-def dead $scc + $exec = S_MOV_B64_term %exec_if + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + + S_NOP 0, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31 + + bb.3: + successors: %bb.4(0x7c000000), %bb.5(0x04000000) + + $exec = S_OR_B64 $exec, %exec_save_if, implicit-def $scc + %loop_counter:sreg_32 = S_ADD_I32 %loop_counter, -1, implicit-def dead $scc + S_CMP_LG_U32 %loop_counter, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.5, implicit killed $scc + + bb.4: + successors: %bb.1 + + S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 + S_NOP 0, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15 + S_NOP 0, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23 + + S_BRANCH %bb.1 + + bb.5: + + S_NOP 0, implicit %32 + + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir index 3b3ea3f37db80..1daa709ab6439 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir @@ -37,88 +37,89 @@ body: | ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 - ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 - ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 - ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 - ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 - ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 - ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 - ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 - ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 - ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 - ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 - ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 - ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 - ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 - ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 - ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 - ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 - ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 - ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 - ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 - ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 - ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 - ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 - ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 - ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 - ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 - ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 - ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 - ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 - ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 - ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 - ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 - ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 - ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 - ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 - ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 - ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 - ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 - ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 - ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 - ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 - ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 - ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 - ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 - ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 - ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 - ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 48 - ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 - ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 - ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 - ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 - ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 - ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 - ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 - ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 - ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 - ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 - ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 - ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 - ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 - ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 - ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 - ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 - ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 - ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 - ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 - ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 - ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 - ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 - ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 + ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 + ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 + ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 + ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 + ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 + ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 + ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 + ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 + ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 + ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 + ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 + ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 + ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 + ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 + ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 + ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 + ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 + ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 + ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 + ; GFX908-NEXT: 
[[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 + ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 + ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 + ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 + ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 + ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 + ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 + ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 + ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 + ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 + ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 + ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 + ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 + ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 + ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 + ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 + ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 + ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 + ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 + ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 + ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 + ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 + ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 + ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 + ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 + ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 + ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 + ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 + ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 + ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 + ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 + ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 + ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 + ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 + ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 + ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 + ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 + ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 + ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 + ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 + ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 + ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: - ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 + ; GFX908-NEXT: 
[[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 + ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 + ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]] + ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]] @@ -134,95 +135,95 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: small_num_sgprs_as_spill ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.1(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 - ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 - ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 - ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 - ; GFX90A-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 - ; GFX90A-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 - ; GFX90A-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 - ; GFX90A-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 - ; GFX90A-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 - ; GFX90A-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 - ; GFX90A-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 - ; GFX90A-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 - ; GFX90A-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 - ; GFX90A-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 - ; GFX90A-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 - ; GFX90A-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 - ; GFX90A-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 - ; GFX90A-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 - ; GFX90A-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 - ; GFX90A-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 - ; GFX90A-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 - ; GFX90A-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 - ; GFX90A-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 - ; GFX90A-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 - ; GFX90A-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 - ; GFX90A-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 - ; GFX90A-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 - ; GFX90A-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 - ; GFX90A-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 - ; 
GFX90A-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 - ; GFX90A-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 - ; GFX90A-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 - ; GFX90A-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 - ; GFX90A-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 - ; GFX90A-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 - ; GFX90A-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 - ; GFX90A-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 - ; GFX90A-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 - ; GFX90A-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 - ; GFX90A-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 - ; GFX90A-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 - ; GFX90A-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 - ; GFX90A-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 - ; GFX90A-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 - ; GFX90A-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 - ; GFX90A-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 - ; GFX90A-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 - ; GFX90A-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 - ; GFX90A-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 - ; GFX90A-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 - ; GFX90A-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 - ; GFX90A-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 - ; GFX90A-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 - ; GFX90A-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 - ; GFX90A-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 - ; GFX90A-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 - ; GFX90A-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 - ; GFX90A-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 - ; GFX90A-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 - ; GFX90A-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 - ; GFX90A-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 - ; GFX90A-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 - ; GFX90A-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 - ; GFX90A-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 - ; GFX90A-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 - ; GFX90A-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 - ; GFX90A-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 - ; GFX90A-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 - ; GFX90A-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 - ; GFX90A-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 - ; GFX90A-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX90A-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX90A-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX90A-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX90A-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX90A-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX90A-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX90A-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 - ; GFX90A-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 + ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 + ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 + ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 + ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 + ; GFX90A-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 10 + ; GFX90A-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 + ; GFX90A-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 + ; GFX90A-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 + ; GFX90A-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 + ; GFX90A-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 + ; GFX90A-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 + ; GFX90A-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 + ; GFX90A-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 + ; GFX90A-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 + ; GFX90A-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 + ; GFX90A-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 + ; GFX90A-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 + ; GFX90A-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 + ; GFX90A-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 + ; GFX90A-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 + ; GFX90A-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 + ; GFX90A-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 + ; GFX90A-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 + ; GFX90A-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 + ; GFX90A-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 + ; GFX90A-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 + ; GFX90A-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 + ; GFX90A-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 + ; GFX90A-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 + ; GFX90A-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 + ; GFX90A-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 + ; GFX90A-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 + ; GFX90A-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 + ; GFX90A-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 + ; GFX90A-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 + ; GFX90A-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 + ; GFX90A-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 + ; GFX90A-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 + ; GFX90A-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 + ; GFX90A-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 + ; GFX90A-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 + ; GFX90A-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GFX90A-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 + ; GFX90A-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 + ; GFX90A-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 + ; GFX90A-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 + ; GFX90A-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 + ; GFX90A-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 + ; GFX90A-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 + ; GFX90A-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 + ; GFX90A-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 + ; GFX90A-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 + ; GFX90A-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 + ; GFX90A-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 + ; GFX90A-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 + ; GFX90A-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 + ; GFX90A-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 + ; GFX90A-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 + ; GFX90A-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 + ; GFX90A-NEXT: 
[[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+ ; GFX90A-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+ ; GFX90A-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+ ; GFX90A-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+ ; GFX90A-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+ ; GFX90A-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+ ; GFX90A-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX90A-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX90A-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX90A-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX90A-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX90A-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX90A-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX90A-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+ ; GFX90A-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
 ; GFX90A-NEXT: {{ $}}
 ; GFX90A-NEXT: bb.1:
- ; GFX90A-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX90A-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX90A-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+ ; GFX90A-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+ ; GFX90A-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+ ; GFX90A-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+ ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]]
+ ; GFX90A-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
@@ -238,7 +239,6 @@ body: |
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]]
 ; GFX90A-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1
@@ -796,9 +796,6 @@ body: |
 ; GFX908-NEXT: [[DEF26:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF27:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -839,15 +836,18 @@ body: |
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]]
 ; GFX908-NEXT: [[DEF32:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF32]], implicit [[DEF]], implicit [[DEF1]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[V_CVT_I32_F64_e32_31]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF32]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF28]]
 ; GFX908-NEXT: S_ENDPGM 0
 ;
 ; GFX90A-LABEL: name: reduce_arch_and_acc_vgrp_spill
@@ -910,9 +910,6 @@ body: |
 ; GFX90A-NEXT: [[DEF26:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF27:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
 ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
 ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
@@ -927,15 +924,18 @@ body: |
 ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
 ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
 ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX90A-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]]
 ; GFX90A-NEXT: [[DEF32:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF32]], implicit [[DEF]], implicit [[DEF1]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[V_CVT_I32_F64_e32_31]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF31]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF32]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF28]]
 ; GFX90A-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1
@@ -2174,6 +2174,8 @@ body: |
 ; GFX908-NEXT: [[DEF243:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF244:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF245:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
 ; GFX908-NEXT: [[DEF246:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF247:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF248:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -2184,8 +2186,7 @@ body: |
 ; GFX908-NEXT: [[DEF253:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF254:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: [[DEF255:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]]
 ; GFX908-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX908-NEXT: S_NOP 0, implicit [[DEF256]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
 ; GFX908-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
@@ -2211,8 +2212,7 @@ body: |
 ; GFX908-NEXT: S_NOP 0, implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]], implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]], implicit [[DEF218]]
 ; GFX908-NEXT: S_NOP 0, implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]]
 ; GFX908-NEXT: S_NOP 0, implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
 ; GFX908-NEXT: S_ENDPGM 0
 ;
 ; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit
@@ -2465,6 +2465,10 @@ body: |
 ; GFX90A-NEXT: [[DEF243:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF244:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF245:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: bb.1:
 ; GFX90A-NEXT: [[DEF246:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF247:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF248:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -2475,10 +2479,7 @@ body: |
 ; GFX90A-NEXT: [[DEF253:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF254:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: [[DEF255:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
- ; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.1:
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]]
 ; GFX90A-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
 ; GFX90A-NEXT: S_NOP 0, implicit [[DEF256]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
@@ -2504,8 +2505,7 @@ body: |
 ; GFX90A-NEXT: S_NOP 0, implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]], implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]], implicit [[DEF218]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]]
 ; GFX90A-NEXT: S_NOP 0, implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
 ; GFX90A-NEXT: S_ENDPGM 0
 
 bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index c117473581746..7d27716c311a4 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -2104,9 +2104,13 @@ body: |
 ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
 ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
 ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
 ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
 ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
@@ -2116,11 +2120,7 @@ body: |
 ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 81
 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 82
 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 83
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 84
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 ; GFX908-NEXT: {{ $}}
 ; GFX908-NEXT: bb.1:
 ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -2468,9 +2468,9 @@ body: |
 ; GFX908-NEXT: bb.2:
 ; GFX908-NEXT: successors: %bb.3(0x80000000)
 ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
 ; GFX908-NEXT: S_NOP 0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
 ; GFX908-NEXT: {{ $}}
 ; GFX908-NEXT: bb.3:
 ; GFX908-NEXT: successors: %bb.4(0x80000000)
@@ -6329,7 +6329,7 @@ body: |
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
 ; GFX908-NEXT: {{ $}}
 ; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: DBG_VALUE %23, 0, 0
+ ; GFX908-NEXT: DBG_VALUE %23:vgpr_32, 0, 0
 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -7135,12 +7135,12 @@ body: |
 S_ENDPGM 0
 ...
 ---
-name: test_rollback_remats_emptydefregion
+name: test_rollback_remats_emptydefregion_block
 tracksRegLiveness: true
 machineFunctionInfo:
 isEntryFunction: true
 body: |
- ; GFX908-LABEL: name: test_rollback_remats_emptydefregion
+ ; GFX908-LABEL: name: test_rollback_remats_emptydefregion_block
 ; GFX908: bb.0:
 ; GFX908-NEXT: successors: %bb.1(0x80000000)
 ; GFX908-NEXT: {{ $}}
@@ -7269,6 +7269,142 @@ body: |
 S_ENDPGM 0
 ...
 ---
+name: test_rollback_remats_emptydefregion_barrier
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: test_rollback_remats_emptydefregion_barrier
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: successors: %bb.2(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+ ; GFX908-NEXT: SCHED_BARRIER 0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.2:
+ ; GFX908-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]]
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.3:
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+
+ bb.1:
+ successors: %bb.2
+
+ %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+ %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+ SCHED_BARRIER 0
+ S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+
+ bb.2:
+ successors: %bb.3
+
+ S_NOP 0, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ S_NOP 0, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+ S_NOP 0, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19
+ S_NOP 0, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24
+ S_NOP 0, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29
+ S_NOP 0, implicit %30, implicit %31, implicit %32, implicit %33
+
+ bb.3:
+
+ S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ S_NOP 0, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ S_NOP 0, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+ S_NOP 0, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19
+ S_NOP 0, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24
+ S_NOP 0, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29
+ S_NOP 0, implicit %30, implicit %31
+
+ S_ENDPGM 0
+...
+---
 name: test_occ_8_physreg_use
 tracksRegLiveness: true
 machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4bb653848cbf0..87aeb5dff490b 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -500,8 +500,8 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
 ; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
index fe8c90ee7b686..fa0342bed513b 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
@@ -35,4 +35,4 @@ entry:
 declare void @llvm.amdgcn.sched.barrier(i32 immarg) #0
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
-declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1
\ No newline at end of file
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1