diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 71f11bf89368f..368a469d00e37 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1709,6 +1709,20 @@ The AMDGPU backend supports the following LLVM IR attributes. as hidden. Hidden arguments are managed by the compiler and are not part of the explicit arguments supplied by the user. + "amdgpu-sgpr-hazard-wait" Disable SGPR hazard wait insertion if set to 0. + Exists for testing performance impact of SGPR hazard waits only. + + "amdgpu-sgpr-hazard-boundary-cull" Enable insertion of SGPR hazard cull sequences at function call boundaries. + Cull sequence reduces future hazard waits, but has a performance cost. + + "amdgpu-sgpr-hazard-mem-wait-cull" Enable insertion of SGPR hazard cull sequences before memory waits. + Cull sequence reduces future hazard waits, but has a performance cost. + Attempts to amortize cost by overlapping with memory accesses. + + "amdgpu-sgpr-hazard-mem-wait-cull-threshold" + Sets the number of active SGPR hazards that must be present before + inserting a cull sequence at a memory wait. 
+ ======================================= ========================================================== Calling Conventions diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 3d5a44a3623a0..31656c98ccd36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -463,6 +463,9 @@ void initializeAMDGPUSetWavePriorityPass(PassRegistry &); void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &); extern char &GCNRewritePartialRegUsesID; +void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); +extern char &AMDGPUWaitSGPRHazardsLegacyID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5b2081c8fa213..96062b30fc012 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -29,6 +29,7 @@ #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUUnifyDivergentExitNodes.h" +#include "AMDGPUWaitSGPRHazards.h" #include "GCNDPPCombine.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" @@ -549,6 +550,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNRewritePartialRegUsesPass(*PR); initializeGCNRegPressurePrinterPass(*PR); initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); + initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -1678,6 +1680,8 @@ void GCNPassConfig::addPreEmitPass() { // cases. 
addPass(&PostRAHazardRecognizerID); + addPass(&AMDGPUWaitSGPRHazardsLegacyID); + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp new file mode 100644 index 0000000000000..e70d6aab306fe --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -0,0 +1,517 @@ +//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUWaitSGPRHazards.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards" + +static cl::opt GlobalEnableSGPRHazardWaits( + "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden, + cl::desc("Enable required s_wait_alu on SGPR hazards")); + +static cl::opt GlobalCullSGPRHazardsOnFunctionBoundary( + "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden, + cl::desc("Cull hazards on function boundaries")); + +static cl::opt + GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull", + cl::init(false), cl::Hidden, + cl::desc("Cull hazards on memory waits")); + +static cl::opt GlobalCullSGPRHazardsMemWaitThreshold( + "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden, + cl::desc("Number of tracked SGPRs before initiating hazard cull on memory " + "wait")); + +namespace { + 
+class AMDGPUWaitSGPRHazards { +public: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + unsigned DsNopCount; + + bool EnableSGPRHazardWaits; + bool CullSGPRHazardsOnFunctionBoundary; + bool CullSGPRHazardsAtMemWait; + unsigned CullSGPRHazardsMemWaitThreshold; + + AMDGPUWaitSGPRHazards() {} + + // Return the numeric ID 0-127 for a given SGPR. + static std::optional sgprNumber(Register Reg, + const SIRegisterInfo &TRI) { + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + return {}; + default: + break; + } + unsigned RegN = TRI.getHWRegIndex(Reg); + if (RegN > 127) + return {}; + return RegN; + } + + static inline bool isVCC(Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + } + + // Adjust global offsets for instructions bundled with S_GETPC_B64 after + // insertion of a new instruction. + static void updateGetPCBundle(MachineInstr *NewMI) { + if (!NewMI->isBundled()) + return; + + // Find start of bundle. + auto I = NewMI->getIterator(); + while (I->isBundledWithPred()) + I--; + if (I->isBundle()) + I++; + + // Bail if this is not an S_GETPC bundle. + if (I->getOpcode() != AMDGPU::S_GETPC_B64) + return; + + // Update offsets of any references in the bundle. 
+ const unsigned NewBytes = 4; + assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + "Unexpected instruction insertion in bundle"); + auto NextMI = std::next(NewMI->getIterator()); + auto End = NewMI->getParent()->end(); + while (NextMI != End && NextMI->isBundledWithPred()) { + for (auto &Operand : NextMI->operands()) { + if (Operand.isGlobal()) + Operand.setOffset(Operand.getOffset() + NewBytes); + } + NextMI++; + } + } + + struct HazardState { + static constexpr unsigned None = 0; + static constexpr unsigned SALU = (1 << 0); + static constexpr unsigned VALU = (1 << 1); + + std::bitset<64> Tracked; // SGPR banks ever read by VALU + std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU + std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU + unsigned VCCHazard = None; // Source of current VCC writes + bool ActiveFlat = false; // Has unwaited flat instructions + + bool merge(const HazardState &RHS) { + HazardState Orig(*this); + *this |= RHS; + return (*this != Orig); + } + + bool operator==(const HazardState &RHS) const { + return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards && + VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard && + ActiveFlat == RHS.ActiveFlat; + } + + bool operator!=(const HazardState &RHS) const { return !(*this == RHS); } + + void operator|=(const HazardState &RHS) { + Tracked |= RHS.Tracked; + SALUHazards |= RHS.SALUHazards; + VALUHazards |= RHS.VALUHazards; + VCCHazard |= RHS.VCCHazard; + ActiveFlat |= RHS.ActiveFlat; + } + }; + + struct BlockHazardState { + HazardState In; + HazardState Out; + }; + + DenseMap BlockState; + + static constexpr unsigned WAVE32_NOPS = 4; + static constexpr unsigned WAVE64_NOPS = 8; + + void insertHazardCull(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator &MI) { + assert(!MI->isBundled()); + unsigned Count = DsNopCount; + while (Count--) + BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP)); + } + + bool 
runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { + enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 }; + + HazardState State = BlockState[&MBB].In; + SmallSet SeenRegs; + bool Emitted = false; + unsigned DsNops = 0; + + for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(), + E = MBB.instr_end(); + MI != E; ++MI) { + if (MI->isMetaInstruction()) + continue; + + // Clear tracked SGPRs if sufficient DS_NOPs occur + if (MI->getOpcode() == AMDGPU::DS_NOP) { + if (++DsNops >= DsNopCount) + State.Tracked.reset(); + continue; + } + DsNops = 0; + + // Snoop FLAT instructions to avoid adding culls before scratch/lds loads. + // Culls could be disproportionate in cost to load time. + if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI)) + State.ActiveFlat = true; + + // SMEM or VMEM clears hazards + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) { + State.VCCHazard = HazardState::None; + State.SALUHazards.reset(); + State.VALUHazards.reset(); + continue; + } + + // Existing S_WAITALU can clear hazards + if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned int Mask = MI->getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0) + State.VCCHazard &= ~HazardState::VALU; + if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) { + State.SALUHazards.reset(); + State.VCCHazard &= ~HazardState::SALU; + } + if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0) + State.VALUHazards.reset(); + continue; + } + + // Snoop counter waits to insert culls + if (CullSGPRHazardsAtMemWait && + (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT || + MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT || + MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) && + (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) && + (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) { + if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) { + State.ActiveFlat = false; + } else { + State.Tracked.reset(); + if (Emit) + insertHazardCull(MBB, MI); + 
continue; + } + } + + // Process only VALUs and SALUs + bool IsVALU = SIInstrInfo::isVALU(*MI); + bool IsSALU = SIInstrInfo::isSALU(*MI); + if (!IsVALU && !IsSALU) + continue; + + unsigned Wait = 0; + + auto processOperand = [&](const MachineOperand &Op, bool IsUse) { + if (!Op.isReg()) + return; + Register Reg = Op.getReg(); + assert(!Op.getSubReg()); + if (!TRI->isSGPRReg(*MRI, Reg)) + return; + + // Only visit each register once + if (!SeenRegs.insert(Reg).second) + return; + + auto RegNumber = sgprNumber(Reg, *TRI); + if (!RegNumber) + return; + + // Track SGPRs by pair -- numeric ID of an 64b SGPR pair. + // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc + unsigned RegN = *RegNumber; + unsigned PairN = (RegN >> 1) & 0x3f; + + // Read/write of untracked register is safe; but must record any new + // reads. + if (!State.Tracked[PairN]) { + if (IsVALU && IsUse) + State.Tracked.set(PairN); + return; + } + + uint8_t SGPRCount = + AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32; + + if (IsUse) { + // SALU reading SGPR clears VALU hazards + if (IsSALU) { + if (isVCC(Reg)) { + if (State.VCCHazard & HazardState::VALU) + State.VCCHazard = HazardState::None; + } else { + State.VALUHazards.reset(); + } + } + // Compute required waits + for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) { + Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0; + Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0; + } + if (isVCC(Reg) && State.VCCHazard) { + // Note: it's possible for both SALU and VALU to exist if VCC + // was updated differently by merged predecessors. + if (State.VCCHazard & HazardState::SALU) + Wait |= WA_SALU; + if (State.VCCHazard & HazardState::VALU) + Wait |= WA_VCC; + } + } else { + // Update hazards + if (isVCC(Reg)) { + State.VCCHazard = IsSALU ? 
HazardState::SALU : HazardState::VALU; + } else { + for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) { + if (IsSALU) + State.SALUHazards.set(RegN + RegIdx); + else + State.VALUHazards.set(RegN + RegIdx); + } + } + } + }; + + const bool IsSetPC = + (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) && + MI->getOpcode() != AMDGPU::S_ENDPGM && + MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED; + + // Only consider implicit VCC specified by instruction descriptor. + const bool HasImplicitVCC = + llvm::any_of(MI->getDesc().implicit_uses(), + [](MCPhysReg Reg) { return isVCC(Reg); }) || + llvm::any_of(MI->getDesc().implicit_defs(), + [](MCPhysReg Reg) { return isVCC(Reg); }); + + if (IsSetPC) { + // All SGPR writes before a call/return must be flushed as the + // callee/caller will not will not see the hazard chain. + if (State.VCCHazard & HazardState::VALU) + Wait |= WA_VCC; + if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU)) + Wait |= WA_SALU; + if (State.VALUHazards.any()) + Wait |= WA_VALU; + if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) { + State.Tracked.reset(); + if (Emit) + insertHazardCull(MBB, MI); + } + } else { + // Process uses to determine required wait. 
+ SeenRegs.clear(); + for (const MachineOperand &Op : MI->all_uses()) { + if (Op.isImplicit() && + (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg()))) + continue; + processOperand(Op, true); + } + } + + // Apply wait + if (Wait) { + unsigned Mask = 0xffff; + if (Wait & WA_VCC) { + State.VCCHazard &= ~HazardState::VALU; + Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0); + } + if (Wait & WA_SALU) { + State.SALUHazards.reset(); + State.VCCHazard &= ~HazardState::SALU; + Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0); + } + if (Wait & WA_VALU) { + State.VALUHazards.reset(); + Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0); + } + if (Emit) { + auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(Mask); + updateGetPCBundle(NewMI); + Emitted = true; + } + } + + // On return from a call SGPR state is unknown, so all potential hazards. + if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary) + State.Tracked.set(); + + // Update hazards based on defs. 
+ SeenRegs.clear(); + for (const MachineOperand &Op : MI->all_defs()) { + if (Op.isImplicit() && + (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg()))) + continue; + processOperand(Op, false); + } + } + + bool Changed = State != BlockState[&MBB].Out; + if (Emit) { + assert(!Changed && "Hazard state should not change on emit pass"); + return Emitted; + } + if (Changed) + BlockState[&MBB].Out = State; + return Changed; + } + + bool run(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + if (!ST.hasVALUReadSGPRHazard()) + return false; + + // Parse settings + EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits; + CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary; + CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait; + CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold; + + if (!GlobalEnableSGPRHazardWaits.getNumOccurrences()) + EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits); + if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences()) + CullSGPRHazardsOnFunctionBoundary = + MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull"); + if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences()) + CullSGPRHazardsAtMemWait = + MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull"); + if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences()) + CullSGPRHazardsMemWaitThreshold = + MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-sgpr-hazard-mem-wait-cull-threshold", + CullSGPRHazardsMemWaitThreshold); + + // Bail if disabled + if (!EnableSGPRHazardWaits) + return false; + + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + DsNopCount = ST.isWave64() ? 
WAVE64_NOPS : WAVE32_NOPS; + + auto CallingConv = MF.getFunction().getCallingConv(); + if (!AMDGPU::isEntryFunctionCC(CallingConv) && + !CullSGPRHazardsOnFunctionBoundary) { + // Callee must consider all SGPRs as tracked. + LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n"); + MachineBasicBlock &EntryBlock = MF.front(); + BlockState[&EntryBlock].In.Tracked.set(); + } + + // Calculate the hazard state for each basic block. + // Iterate until a fixed point is reached. + // Fixed point is guaranteed as merge function only ever increases + // the hazard set, and all backedges will cause a merge. + // + // Note: we have to take care of the entry block as this technically + // has an edge from outside the function. Failure to treat this as + // a merge could prevent fixed point being reached. + SetVector Worklist; + for (auto &MBB : reverse(MF)) + Worklist.insert(&MBB); + while (!Worklist.empty()) { + auto &MBB = *Worklist.pop_back_val(); + bool Changed = runOnMachineBasicBlock(MBB, false); + if (Changed) { + // Note: take a copy of state here in case it is reallocated by map + HazardState NewState = BlockState[&MBB].Out; + // Propagate to all successor blocks + for (auto Succ : MBB.successors()) { + // We only need to merge hazards at CFG merge points. + auto &SuccState = BlockState[Succ]; + if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) { + if (SuccState.In != NewState) { + SuccState.In = NewState; + Worklist.insert(Succ); + } + } else if (SuccState.In.merge(NewState)) { + Worklist.insert(Succ); + } + } + } + } + + LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n"); + + // Final to emit wait instructions. 
+ bool Changed = false; + for (auto &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB, true); + + BlockState.clear(); + return Changed; + } +}; + +class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + return AMDGPUWaitSGPRHazards().run(MF); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace + +char AMDGPUWaitSGPRHazardsLegacy::ID = 0; + +char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID; + +INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE, + "AMDGPU Insert waits for SGPR read hazards", false, false) + +PreservedAnalyses +AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (AMDGPUWaitSGPRHazards().run(MF)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h new file mode 100644 index 0000000000000..58e9bca4c3ede --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h @@ -0,0 +1,25 @@ +//===--- AMDGPUWaitSGPRHazards.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUWaitSGPRHazardsPass
+    : public PassInfoMixin<AMDGPUWaitSGPRHazardsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 91cae76256306..408da0536237e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -110,6 +110,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
   AMDGPUTargetTransformInfo.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
+  AMDGPUWaitSGPRHazards.cpp
   R600MachineCFGStructurizer.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 873d18e30a430..a21702af11a98 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,7 +14,6 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
@@ -45,10 +44,6 @@ static cl::opt
                      cl::desc("Fill a percentage of the latency between "
                               "neighboring MFMA with s_nops."));
 
-static cl::opt MaxExhaustiveHazardSearch(
-    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
-    cl::desc("Maximum function size for exhausive hazard search"));
-
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer
Implementation //===----------------------------------------------------------------------===// @@ -60,7 +55,6 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), - UseVALUReadHazardExhaustiveSearch(false), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5; RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); @@ -1217,7 +1211,6 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); - fixVALUReadSGPRHazard(MI); fixRequiredExportPriority(MI); } @@ -3106,274 +3099,6 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return true; } -// Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. -// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc -static std::optional sgprPairNumber(Register Reg, - const SIRegisterInfo &TRI) { - switch (Reg) { - case AMDGPU::M0: - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - return {}; - default: - break; - } - unsigned RegN = TRI.getEncodingValue(Reg); - if (RegN > 127) - return {}; - return (RegN >> 1) & 0x3f; -} - -// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. -void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { - assert(MMF == &MF); - - // Assume non-empty vector means it has already been computed. 
- if (!VALUReadHazardSGPRs.empty()) - return; - - auto CallingConv = MF.getFunction().getCallingConv(); - bool IsCallFree = - AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); - - // Exhaustive search is only viable in non-caller/callee functions where - // VALUs will be exposed to the hazard recognizer. - UseVALUReadHazardExhaustiveSearch = - IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && - MF.getInstructionCount() <= MaxExhaustiveHazardSearch; - - // Consider all SGPRs hazards if the shader uses function calls or is callee. - bool UseVALUUseCache = - IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; - VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); - if (!UseVALUUseCache) - return; - - // Perform a post ordered reverse scan to find VALUs which read an SGPR - // before a SALU write to the same SGPR. This provides a reduction in - // hazard insertion when all VALU access to an SGPR occurs after its last - // SALU write, when compared to a linear scan. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - BitVector SALUWriteSGPRs(64), ReadSGPRs(64); - MachineCycleInfo CI; - CI.compute(*MMF); - - for (auto *MBB : post_order(&MF)) { - bool InCycle = CI.getCycle(MBB) != nullptr; - for (auto &MI : reverse(MBB->instrs())) { - bool IsVALU = SIInstrInfo::isVALU(MI); - bool IsSALU = SIInstrInfo::isSALU(MI); - if (!IsVALU && !IsSALU) - continue; - - for (const MachineOperand &Op : MI.operands()) { - if (!Op.isReg()) - continue; - Register Reg = Op.getReg(); - assert(!Op.getSubReg()); - // Only consider implicit operands of VCC. - if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || - Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) - continue; - if (!TRI.isSGPRReg(MRI, Reg)) - continue; - auto RegN = sgprPairNumber(Reg, TRI); - if (!RegN) - continue; - if (IsVALU && Op.isUse()) { - // Note: any access within a cycle must be considered a hazard. 
- if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) - VALUReadHazardSGPRs.set(*RegN); - ReadSGPRs.set(*RegN); - } else if (IsSALU) { - if (Op.isDef()) - SALUWriteSGPRs.set(*RegN); - else - ReadSGPRs.set(*RegN); - } - } - } - } -} - -bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { - if (!ST.hasVALUReadSGPRHazard()) - return false; - - // The hazard sequence is fundamentally three instructions: - // 1. VALU reads SGPR - // 2. SALU writes SGPR - // 3. VALU/SALU reads SGPR - // Try to avoid searching for (1) because the expiry point of the hazard is - // indeterminate; however, the hazard between (2) and (3) can expire if the - // gap contains sufficient SALU instructions with no usage of SGPR from (1). - // Note: SGPRs must be considered as 64-bit pairs as hazard exists - // even if individual SGPRs are accessed. - - bool MIIsSALU = SIInstrInfo::isSALU(*MI); - bool MIIsVALU = SIInstrInfo::isVALU(*MI); - if (!(MIIsSALU || MIIsVALU)) - return false; - - // Avoid expensive search when compile time is priority by - // mitigating every SALU which writes an SGPR. - if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { - if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) - return false; - - const MachineOperand *SDSTOp = - TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; - - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) - return false; - - // Add s_wait_alu sa_sdst(0) after SALU write. - auto NextMI = std::next(MI->getIterator()); - auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), - TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); - - // SALU write may be s_getpc in a bundle. - updateGetPCBundle(NewMI); - - return true; - } - - // Pre-compute set of SGPR pairs read by VALUs. 
- // Note: pass mutable pointer to MachineFunction for CycleInfo. - computeVALUHazardSGPRs(MI->getMF()); - - // If no VALUs hazard SGPRs exist then nothing to do. - if (VALUReadHazardSGPRs.none()) - return false; - - // All SGPR writes before a call/return must be flushed as the callee/caller - // will not will not see the hazard chain, i.e. (2) to (3) described above. - const bool IsSetPC = (MI->isCall() || MI->isReturn()) && - !(MI->getOpcode() == AMDGPU::S_ENDPGM || - MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); - - // Collect all SGPR sources for MI which are read by a VALU. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallSet SGPRsUsed; - - if (!IsSetPC) { - for (const MachineOperand &Op : MI->all_uses()) { - Register OpReg = Op.getReg(); - - // Only consider VCC implicit uses on VALUs. - // The only expected SALU implicit access is SCC which is no hazard. - if (MIIsSALU && Op.isImplicit()) - continue; - - if (!TRI.isSGPRReg(MRI, OpReg)) - continue; - - auto RegN = sgprPairNumber(OpReg, TRI); - if (!RegN) - continue; - - if (!VALUReadHazardSGPRs[*RegN]) - continue; - - SGPRsUsed.insert(OpReg); - } - - // No SGPRs -> nothing to do. - if (SGPRsUsed.empty()) - return false; - } - - // A hazard is any SALU which writes one of the SGPRs read by MI. - auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { - if (!SIInstrInfo::isSALU(I)) - return false; - // Ensure SGPR flush before call/return by conservatively assuming every - // SALU writes an SGPR. - if (IsSetPC && I.getNumDefs() > 0) - return true; - // Check for any register writes. - return any_of(SGPRsUsed, [this, &I](Register Reg) { - return I.modifiesRegister(Reg, &TRI); - }); - }; - - const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; - auto IsExpiredFn = [&](const MachineInstr &I, int Count) { - if (Count >= SALUExpiryCount) - return true; - // s_wait_alu sa_sdst(0) on path mitigates hazard. 
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - return false; - }; - - auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { - // Only count true SALUs as wait states. - if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) - return 0; - // SALU must be unrelated to any hazard registers. - if (any_of(SGPRsUsed, - [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) - return 0; - return 1; - }; - - // Check for the hazard. - DenseSet Visited; - int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, WaitStatesFn); - - if (WaitStates >= SALUExpiryCount) - return false; - - // Validate hazard through an exhaustive search. - if (UseVALUReadHazardExhaustiveSearch) { - // A hazard is any VALU which reads one of the paired SGPRs read by MI. - // This is searching for (1) in the hazard description. - auto hazardPair = [this](Register Reg) { - if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) - return Register(AMDGPU::VCC); - auto RegN = sgprPairNumber(Reg, TRI); - return Register(AMDGPU::SGPR0_SGPR1 + *RegN); - }; - auto SearchHazardFn = [this, hazardPair, - &SGPRsUsed](const MachineInstr &I) { - if (!SIInstrInfo::isVALU(I)) - return false; - // Check for any register reads. - return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { - return I.readsRegister(hazardPair(Reg), &TRI); - }); - }; - auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { - return false; - }; - if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == - std::numeric_limits::max()) - return false; - } - - // Add s_wait_alu sa_sdst(0) before SALU read. - auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); - - // SALU read may be after s_getpc in a bundle. 
- updateGetPCBundle(NewMI); - - return true; -} - static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII) { MachineBasicBlock &EntryMBB = MF->front(); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 83ce100c58f0a..bbc55851bf967 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -48,8 +48,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const SIRegisterInfo &TRI; const TargetSchedModel &TSchedModel; bool RunLdsBranchVmemWARHazardFixup; - BitVector VALUReadHazardSGPRs; - bool UseVALUReadHazardExhaustiveSearch; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; @@ -109,8 +107,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); - void computeVALUHazardSGPRs(MachineFunction *MMF); - bool fixVALUReadSGPRHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index d9c0aa300855f..59afcbed35294 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -134,6 +134,12 @@ unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { return VersionMajor >= 12 ? 
8 : 0; } +/// \returns VaSdst bit width +inline unsigned getVaSdstBitWidth() { return 3; } + +/// \returns VaSdst bit shift +inline unsigned getVaSdstBitShift() { return 9; } + /// \returns VmVsrc bit width inline unsigned getVmVsrcBitWidth() { return 3; } @@ -146,6 +152,12 @@ inline unsigned getVaVdstBitWidth() { return 4; } /// \returns VaVdst bit shift inline unsigned getVaVdstBitShift() { return 12; } +/// \returns VaVcc bit width +inline unsigned getVaVccBitWidth() { return 1; } + +/// \returns VaVcc bit shift +inline unsigned getVaVccBitShift() { return 1; } + /// \returns SaSdst bit width inline unsigned getSaSdstBitWidth() { return 1; } @@ -1719,6 +1731,14 @@ unsigned decodeFieldSaSdst(unsigned Encoded) { return unpackBits(Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); } +unsigned decodeFieldVaSdst(unsigned Encoded) { + return unpackBits(Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); +} + +unsigned decodeFieldVaVcc(unsigned Encoded) { + return unpackBits(Encoded, getVaVccBitShift(), getVaVccBitWidth()); +} + unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } @@ -1743,6 +1763,22 @@ unsigned encodeFieldSaSdst(unsigned SaSdst) { return encodeFieldSaSdst(0xffff, SaSdst); } +unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) { + return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); +} + +unsigned encodeFieldVaSdst(unsigned VaSdst) { + return encodeFieldVaSdst(0xffff, VaSdst); +} + +unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) { + return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth()); +} + +unsigned encodeFieldVaVcc(unsigned VaVcc) { + return encodeFieldVaVcc(0xffff, VaVcc); +} + } // namespace DepCtr //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 
3c9246d5e107d..fad7e67ff3c76 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1168,6 +1168,12 @@ unsigned decodeFieldVmVsrc(unsigned Encoded); /// \returns Decoded SaSdst from given immediate \p Encoded. unsigned decodeFieldSaSdst(unsigned Encoded); +/// \returns Decoded VaSdst from given immediate \p Encoded. +unsigned decodeFieldVaSdst(unsigned Encoded); + +/// \returns Decoded VaVcc from given immediate \p Encoded. +unsigned decodeFieldVaVcc(unsigned Encoded); + /// \returns \p VmVsrc as an encoded Depctr immediate. unsigned encodeFieldVmVsrc(unsigned VmVsrc); @@ -1186,6 +1192,18 @@ unsigned encodeFieldSaSdst(unsigned SaSdst); /// \returns \p Encoded combined with encoded \p SaSdst. unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst); +/// \returns \p VaSdst as an encoded Depctr immediate. +unsigned encodeFieldVaSdst(unsigned VaSdst); + +/// \returns \p Encoded combined with encoded \p VaSdst. +unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst); + +/// \returns \p VaVcc as an encoded Depctr immediate. +unsigned encodeFieldVaVcc(unsigned VaVcc); + +/// \returns \p Encoded combined with encoded \p VaVcc. 
+unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc); + } // namespace DepCtr namespace Exp { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 23f24a9dc9982..424388a30e99b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -624,7 +624,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -780,7 +779,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1212,7 +1210,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1366,7 +1363,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1822,14 +1818,12 @@ define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,14 +1988,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 11024b0a88d6b..b52a39f1a55c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -624,7 +624,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -780,7 +779,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1212,7 +1210,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1366,7 +1363,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1822,14 +1818,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,14 +1988,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index d62da6921b347..e1397e7331d3c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2853,6 +2853,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: flat_store_b32 v[0:1], v3 @@ -3840,6 +3841,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 43f3dcc86f426..a948446aceff1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -28,9 +28,11 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 %idx @@ -60,9 +62,11 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_u16 v0, v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, ptr addrspace(1) %ptr %elt = extractelement <128 x i16> %vec, i32 %idx @@ -92,9 +96,11 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, ptr addrspace(1) %ptr %elt = extractelement <32 x i64> %vec, i32 %idx diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll index 9ef54ed724ec0..bf8e10143003a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -171,6 +171,7 @@ define float @v_rsq_clamp_undef_f32() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_s_rsq_f32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 2c71366772fc9..31526bcfead4e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -832,6 +832,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296 %val = load volatile float, ptr addrspace(1) %gep @@ -865,6 +866,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297 %val = load volatile float, ptr addrspace(1) %gep @@ -979,6 +981,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, 
vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %val = load volatile float, ptr addrspace(1) %gep @@ -1017,6 +1020,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256 @@ -1056,6 +1060,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset @@ -1099,6 +1104,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %val = load volatile float, ptr addrspace(1) %gep @@ -1143,6 +1149,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd 
; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095 @@ -1187,6 +1194,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset @@ -1359,6 +1367,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1411,6 +1420,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1590,6 +1600,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = 
getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1643,6 +1654,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 7c6daf769aec2..ba2af13338be6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1072,14 +1072,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: s_wait_alu 0xf1fd ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result @@ -2433,71 +2436,83 @@ define i256 
@v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 ; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v19, v22 ; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 ; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] ; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 ; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 ; GFX12-NEXT: v_mov_b32_e32 v13, v1 ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX12-NEXT: 
v_mov_b32_e32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 @@ -2505,11 +2520,13 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] +; GFX12-NEXT: s_wait_alu 0xf1fd ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index a9e092fa39fbe..cd405fabf002d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -220,7 +220,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -240,6 +239,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -257,7 +257,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; 
GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -266,7 +265,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -499,7 +497,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -519,6 +516,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -537,7 +535,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -883,7 +880,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: 
buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -893,7 +889,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -908,7 +904,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -940,6 +935,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1292,7 +1288,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s3, s[4:5], 0x44 ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s3 @@ -1303,7 +1298,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1318,7 +1313,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -1352,6 +1346,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1686,7 +1681,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1707,6 +1701,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1724,7 +1719,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe 
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1733,7 +1727,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1970,7 +1963,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1992,6 +1984,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2009,7 +2002,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -2025,6 +2017,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; 
GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -2357,7 +2350,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2367,7 +2359,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2382,7 +2374,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2415,6 +2406,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9577230c6c52e..adc91d56c3c27 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll 
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -262,7 +262,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -586,7 +585,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -629,7 +627,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -1018,7 +1015,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -1047,7 +1043,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 @@ -1509,6 +1504,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 @@ -1543,7 +1539,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -1843,7 +1838,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1887,7 +1881,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 @@ -2227,7 +2220,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: 
s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2274,7 +2266,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -2719,7 +2710,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2751,7 +2741,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 @@ -3378,20 +3367,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; 
GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3399,6 +3390,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v3, 0 @@ -3407,9 +3399,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -3461,9 +3455,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null @@ -3486,20 +3482,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3523,7 +3521,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -3547,9 +3544,10 @@ define amdgpu_kernel void 
@add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null @@ -3809,7 +3807,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4139,7 +4136,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4183,7 +4179,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -4573,7 +4568,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr 
addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4602,7 +4596,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 @@ -5064,6 +5057,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 @@ -5098,7 +5092,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5412,7 +5405,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; 
GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5459,7 +5451,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 @@ -5813,7 +5804,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5864,7 +5854,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -6313,7 +6302,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6345,7 +6333,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, 
s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 @@ -6972,20 +6959,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -6993,6 +6982,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -7001,9 +6991,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7055,9 +7047,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: 
v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null @@ -7080,20 +7074,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, v1, v4, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -7117,7 +7113,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -7141,9 +7136,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 905a515d7c125..8c6224cc86284 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -219,7 +219,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: 
v_cmpx_eq_u32_e32 0, v0 @@ -239,6 +238,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -256,7 +256,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -265,7 +264,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -498,7 +496,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -518,6 +515,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; 
GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -536,7 +534,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -882,7 +879,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -892,7 +888,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -907,7 +903,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -939,6 +934,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; 
GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1273,7 +1269,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1294,6 +1289,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1311,7 +1307,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1320,7 +1315,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1557,7 +1551,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1579,6 +1572,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1596,7 +1590,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1612,6 +1605,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -1944,7 +1938,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1954,7 +1947,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1969,7 +1962,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2002,6 +1994,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 9801e6ede5eeb..63b46eba41225 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -226,7 +226,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -247,6 +246,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; 
GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -264,7 +264,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -273,7 +272,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -513,7 +511,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -534,6 +531,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -552,7 +550,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: 
s_wait_alu 0xfffe @@ -905,7 +902,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -915,7 +911,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -930,7 +926,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -962,6 +957,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1431,7 +1427,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1453,6 +1448,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1470,7 +1466,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1479,7 +1474,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1723,7 +1717,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1746,6 +1739,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1763,7 +1757,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1779,6 +1772,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -2118,7 +2112,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -2128,7 +2121,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2143,7 +2136,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: 
s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2176,6 +2168,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index ff47c865c67e6..a03ad4daab014 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -417,12 +417,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: v_cmpx_ne_u32_e32 0, v2 @@ -1495,7 +1496,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll 
b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index a969e3d4f4f79..0ea73ad4c5019 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -382,10 +382,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -394,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -402,7 +401,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1894,14 +1892,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2145,14 +2141,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2376,10 +2370,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2387,7 +2381,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, 
exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: @@ -2409,17 +2402,16 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 @@ -2434,7 +2426,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2921,14 +2912,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] 
; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3192,14 +3181,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3439,7 +3426,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3468,7 +3454,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3790,7 +3775,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -3818,7 +3802,6 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4133,17 +4116,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4170,17 +4152,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN 
-; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4196,7 +4177,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4779,7 +4759,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start @@ -4795,6 +4774,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -4816,7 +4796,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5185,7 +5164,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: 
buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start @@ -5201,6 +5179,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -5221,7 +5200,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5582,17 +5560,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -5614,6 +5591,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd 
; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -5629,17 +5607,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -5655,7 +5632,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6759,10 +6735,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6771,7 +6747,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -6779,7 +6754,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9080,10 +9054,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -9092,7 +9066,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; 
GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: @@ -9100,7 +9073,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c7511a2df9fe1..7f06d169a6b13 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -374,10 +374,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -394,7 +393,6 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1220,14 +1218,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1413,14 +1409,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1589,10 +1583,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff 
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1600,7 +1594,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1624,17 +1617,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1649,7 +1641,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,14 +2025,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2323,14 +2312,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2511,7 +2498,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2541,7 +2527,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2877,7 +2862,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -2906,7 +2890,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3234,17 +3217,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3274,17 +3256,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: 
v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3300,7 +3281,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3896,7 +3876,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3912,6 +3891,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, 
vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3933,7 +3913,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4304,7 +4283,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4320,6 +4298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -4340,7 +4319,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4703,17 +4681,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4735,6 +4712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4750,17 +4728,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4776,7 +4753,6 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5418,14 +5394,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5738,14 +5712,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6038,10 +6010,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6049,7 +6021,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6073,17 +6044,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6099,7 +6069,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6673,6 +6642,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 @@ -6681,6 +6651,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 @@ -6690,14 +6661,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7099,8 +7068,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 @@ -7110,14 +7081,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7494,10 +7463,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7505,7 +7474,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 
; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7531,12 +7499,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 ; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 @@ -7545,17 +7515,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7571,7 +7540,6 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 0bcaacc6b08e8..a6eb81fcbf515 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -374,10 +374,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -394,7 +393,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1220,14 +1218,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1413,14 +1409,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1589,10 +1583,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1600,7 +1594,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1624,17 +1617,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1649,7 +1641,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,14 +2025,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2323,14 +2312,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2511,7 +2498,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2541,7 +2527,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2877,7 +2862,6 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -2906,7 +2890,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3234,17 +3217,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3274,17 +3256,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3300,7 +3281,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3896,7 +3876,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3912,6 +3891,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3933,7 +3913,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 
v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4304,7 +4283,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4320,6 +4298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -4340,7 +4319,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4703,17 +4681,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4735,6 +4712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4750,17 +4728,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4776,7 +4753,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5418,14 +5394,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5738,14 +5712,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6038,10 +6010,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, 
vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6049,7 +6021,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6073,17 +6044,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6099,7 +6069,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6673,6 +6642,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 
0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 @@ -6681,6 +6651,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 @@ -6690,14 +6661,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7099,8 +7068,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 @@ -7110,14 +7081,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7494,10 +7463,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7505,7 +7474,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7531,12 +7499,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 ; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 @@ -7545,17 +7515,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7571,7 +7540,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 23b54c6741e51..a00abecabdffb 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1807,6 +1807,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -1940,6 +1941,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2072,6 +2074,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2204,6 +2207,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2337,6 +2341,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 
+; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2470,6 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2603,6 +2609,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2736,6 +2743,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2868,6 +2876,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3000,6 +3009,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; 
GFX12-NEXT: s_endpgm @@ -3133,6 +3143,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3265,6 +3276,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3397,6 +3409,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3529,6 +3542,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 72f883928cffb..707cae9534830 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5705,6 +5705,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB30_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, 
v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5714,7 +5715,6 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -6059,6 +6059,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -6075,7 +6076,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -6104,6 +6105,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB31_2 ; GFX12-NEXT: .LBB31_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6111,7 +6113,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6479,6 +6480,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -6495,7 +6497,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -6524,6 +6526,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB32_2 ; GFX12-NEXT: .LBB32_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6531,7 +6534,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6912,7 +6914,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -6940,6 +6941,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB33_2 ; GFX12-NEXT: .LBB33_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6947,7 +6949,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7291,6 +7292,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -7306,7 +7308,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7334,6 +7336,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB34_2 ; GFX12-NEXT: .LBB34_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -7341,7 +7344,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7701,6 +7703,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -7716,7 +7719,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7744,6 +7747,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB35_2 ; GFX12-NEXT: .LBB35_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -7751,7 +7755,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8150,7 +8153,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8405,6 +8407,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8440,7 +8443,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8704,6 +8706,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8739,7 +8742,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9036,7 +9038,6 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9282,6 +9283,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9315,7 +9317,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9570,6 +9571,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9603,7 +9605,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9880,7 +9881,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10097,7 +10097,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10297,6 +10296,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10333,7 +10333,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10599,6 +10598,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10633,7 +10633,6 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10915,8 +10914,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10936,7 +10936,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11236,6 +11235,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; 
GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11259,8 +11259,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11280,7 +11281,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11590,6 +11590,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11613,8 +11614,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11634,7 +11636,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11945,6 +11946,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11968,6 +11970,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11987,7 +11990,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12288,6 +12290,7 
@@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -12311,6 +12314,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12330,7 +12334,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12646,6 +12649,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12664,7 +12668,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12928,8 +12931,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12947,7 +12951,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -13215,6 +13218,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -13234,7 +13238,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13525,6 +13528,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13548,8 +13552,9 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -13570,7 +13575,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13882,6 +13886,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) @@ -13905,6 +13910,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13925,7 +13931,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 36aa73fbf8e92..75eb68557b174 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2808,6 +2808,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2819,7 +2820,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3161,6 +3161,7 @@ define double 
@flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3177,7 +3178,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3207,6 +3208,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3216,7 +3218,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3573,6 +3574,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, 
exec_lo @@ -3589,7 +3591,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3619,6 +3621,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3628,7 +3631,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3998,7 +4000,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4028,6 +4029,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4037,7 
+4039,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -4372,6 +4373,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4387,7 +4389,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4417,6 +4419,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4426,7 +4429,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4777,6 
+4779,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4792,7 +4795,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4822,6 +4825,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4831,7 +4835,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5219,6 +5222,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: 
s_wait_loadcnt 0x0 @@ -5230,7 +5234,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5642,6 +5645,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5653,7 +5657,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6034,7 +6037,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6301,6 +6303,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6338,7 +6341,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6614,6 +6616,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6651,7 +6654,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6963,7 +6965,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -7222,6 +7223,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7258,7 +7260,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7526,6 +7527,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7562,7 +7564,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7856,7 +7857,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8092,7 +8092,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8299,6 +8298,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8337,7 +8337,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8615,6 +8614,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8652,7 +8652,6 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8947,8 +8946,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 
vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8968,7 +8968,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9269,6 +9268,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9292,8 +9292,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9313,7 +9314,6 @@ define bfloat 
@flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9624,6 +9624,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9647,8 +9648,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9668,7 +9670,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10001,6 +10002,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 
0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10020,7 +10022,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10312,6 +10313,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10335,6 +10337,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10354,7 +10357,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10656,6 
+10658,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10679,6 +10682,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10698,7 +10702,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11015,6 +11018,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -11033,7 +11037,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11298,8 +11301,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11317,7 +11321,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11565,6 +11568,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11588,8 +11592,9 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11610,7 +11615,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11923,6 +11927,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11946,6 +11951,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11966,7 +11972,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12296,7 +12301,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -12532,7 +12536,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12771,7 +12774,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13027,7 +13029,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13254,7 +13255,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13488,7 +13488,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13742,7 +13741,6 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13983,7 +13981,6 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14223,9 +14220,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14242,7 +14240,6 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14569,9 +14566,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14588,7 +14586,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14918,9 +14915,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; 
GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14937,7 +14935,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15284,8 +15281,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15302,7 +15301,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15619,8 +15617,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: 
s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15637,7 +15637,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15961,8 +15960,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15979,7 +15980,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16322,9 +16322,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16342,7 +16343,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16673,8 +16673,10 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16692,7 +16694,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index d96d3db9f005d..a05e4a0cb2396 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2808,6 +2808,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2819,7 +2820,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3161,6 +3161,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3177,7 +3178,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3207,6 +3208,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: 
scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3216,7 +3218,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3573,6 +3574,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3589,7 +3591,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3619,6 +3621,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3628,7 +3631,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3998,7 +4000,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4028,6 +4029,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4037,7 +4039,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -4372,6 +4373,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4387,7 +4389,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; 
%atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4417,6 +4419,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4426,7 +4429,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4777,6 +4779,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4792,7 +4795,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4822,6 +4825,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, 
vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4831,7 +4835,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5219,6 +5222,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5230,7 +5234,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5642,6 +5645,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5653,7 +5657,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6034,7 +6037,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6301,6 +6303,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6338,7 +6341,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6614,6 +6616,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6651,7 +6654,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6963,7 +6965,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -7222,6 +7223,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7258,7 +7260,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7526,6 +7527,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7562,7 +7564,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7856,7 +7857,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8092,7 +8092,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8299,6 +8298,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8337,7 +8337,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8615,6 +8614,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8652,7 +8652,6 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8947,8 +8946,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8968,7 +8968,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9269,6 +9268,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9292,8 +9292,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9313,7 +9314,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9624,6 +9624,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9647,8 +9648,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9668,7 +9670,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10001,6 +10002,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10020,7 +10022,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10312,6 +10313,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10335,6 +10337,7 @@ define void 
@flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10354,7 +10357,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10656,6 +10658,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10679,6 +10682,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10698,7 +10702,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11015,6 +11018,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -11033,7 +11037,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11298,8 +11301,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11317,7 +11321,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11565,6 +11568,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11588,8 +11592,9 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11610,7 +11615,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11923,6 +11927,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11946,6 +11951,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11966,7 +11972,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12296,7 +12301,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -12532,7 +12536,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12771,7 +12774,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13027,7 +13029,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13254,7 +13255,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13488,7 +13488,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13742,7 +13741,6 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13983,7 +13981,6 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14223,9 +14220,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14242,7 +14240,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14569,9 +14566,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14588,7 +14586,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14918,9 +14915,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14937,7 +14935,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15284,8 +15281,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15302,7 +15301,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: 
s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15619,8 +15617,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15637,7 +15637,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15961,8 +15960,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ 
-15979,7 +15980,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16322,9 +16322,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16342,7 +16343,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16673,8 +16673,10 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16692,7 +16694,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9c2a76380d83d..cd1a161346667 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -41,7 +41,6 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: @@ -237,7 +236,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -437,7 +435,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -653,7 +650,6 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -839,7 +835,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1032,7 +1027,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1246,7 +1240,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1447,7 +1440,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1648,7 +1640,6 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: @@ -1844,7 +1835,6 @@ define float 
@flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2044,7 +2034,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2260,7 +2249,6 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2446,7 +2434,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2639,7 +2626,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2853,7 +2839,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3054,7 +3039,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3270,6 +3254,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_cbranch_execz .LBB16_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3279,7 +3264,6 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: @@ -3642,6 +3626,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3658,7 +3643,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB17_3: ; 
%atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3687,6 +3672,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB17_2 ; GFX12-NEXT: .LBB17_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3694,7 +3680,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4088,6 +4073,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4104,7 +4090,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4133,6 +4119,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB18_2 ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; 
GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4140,7 +4127,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4547,7 +4533,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -4575,6 +4560,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4582,7 +4568,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -4952,6 +4937,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4967,7 +4953,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; 
GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -4995,6 +4981,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5002,7 +4989,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -5388,6 +5374,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -5403,7 +5390,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -5431,6 +5418,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: 
s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5438,7 +5426,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5863,7 +5850,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -6118,6 +6104,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6153,7 +6140,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -6417,6 +6403,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6452,7 +6439,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6749,7 +6735,6 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -6995,6 +6980,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7028,7 +7014,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -7283,6 +7268,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, 
v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7316,7 +7302,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7595,7 +7580,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7816,7 +7800,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8010,6 +7993,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8046,7 +8030,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8312,6 +8295,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8346,7 +8330,6 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8628,8 +8611,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8649,7 +8633,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -8949,6 +8932,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8972,8 +8956,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8993,7 +8978,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9303,6 +9287,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9326,8 +9311,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9347,7 +9333,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9679,6 +9664,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9698,7 +9684,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -9989,6 +9974,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ 
-10012,6 +9998,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10031,7 +10018,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10332,6 +10318,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10355,6 +10342,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10374,7 +10362,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10690,6 +10677,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10708,7 +10696,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -10972,8 +10959,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10991,7 +10979,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11238,6 +11225,7 @@ define bfloat 
@flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11261,8 +11249,9 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11283,7 +11272,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11595,6 +11583,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11618,6 +11607,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_or_b32_e32 
v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11638,7 +11628,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -11965,7 +11954,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: @@ -12186,7 +12174,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12410,7 +12397,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -12650,7 +12636,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -12860,7 +12845,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13077,7 +13061,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13315,7 +13298,6 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -13540,7 +13522,6 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13766,9 +13747,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 
+; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -13785,7 +13767,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -14112,9 +14093,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14131,7 +14113,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14461,9 +14442,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, 
v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14480,7 +14462,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -14827,8 +14808,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -14845,7 +14828,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -15162,8 +15144,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15180,7 +15164,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -15504,8 +15487,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15522,7 +15507,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -15865,9 +15849,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -15885,7 +15870,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16216,8 +16200,10 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16235,7 +16221,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index b4d7ff8e7c526..0b6bdedeb48fc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -128,7 +128,6 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB0_2 ; GFX12-NEXT: .LBB0_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -271,7 +270,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB1_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -428,7 +426,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB2_2 ; GFX12-NEXT: .LBB2_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -577,7 +574,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB3_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB3_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1002,7 +998,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB6_2 ; GFX12-NEXT: .LBB6_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1145,7 +1140,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, 
ptr %out2, i64 %i ; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1287,7 +1281,6 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB8_2 ; GFX12-NEXT: .LBB8_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1428,7 +1421,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB9_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1583,7 +1575,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB10_2 ; GFX12-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1730,7 +1721,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB11_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB11_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2149,7 +2139,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB14_2 ; GFX12-NEXT: .LBB14_4: ; 
%atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2290,7 +2279,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB15_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2434,7 +2422,6 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB16_2 ; GFX12-NEXT: .LBB16_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2577,7 +2564,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB17_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2734,7 +2720,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB18_2 ; GFX12-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2883,7 +2868,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB19_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3308,7 +3292,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB22_2 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3451,7 +3434,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB23_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3595,7 +3577,6 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB24_2 ; GFX12-NEXT: .LBB24_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3741,7 +3722,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB25_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3899,7 +3879,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB26_2 ; GFX12-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4051,7 +4030,6 @@ define amdgpu_kernel void 
@atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB27_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB27_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4481,7 +4459,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB30_2 ; GFX12-NEXT: .LBB30_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4627,7 +4604,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB31_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4772,7 +4748,6 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB32_2 ; GFX12-NEXT: .LBB32_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4918,7 +4893,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB33_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5076,7 +5050,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz 
.LBB34_2 ; GFX12-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5228,7 +5201,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB35_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5658,7 +5630,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB38_2 ; GFX12-NEXT: .LBB38_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5804,7 +5775,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB39_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB39_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5949,7 +5919,6 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB40_2 ; GFX12-NEXT: .LBB40_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6095,7 +6064,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB41_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; 
GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6253,7 +6221,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB42_2 ; GFX12-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6405,7 +6372,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB43_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB43_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6835,7 +6801,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB46_2 ; GFX12-NEXT: .LBB46_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6981,7 +6946,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB47_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB47_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7126,7 +7090,6 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB48_2 ; GFX12-NEXT: .LBB48_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7272,7 +7235,6 @@ define 
amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB49_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7430,7 +7392,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB50_2 ; GFX12-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7582,7 +7543,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB51_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB51_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8012,7 +7972,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB54_2 ; GFX12-NEXT: .LBB54_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8158,7 +8117,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB55_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB55_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8301,7 +8259,6 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
s_cbranch_execnz .LBB56_2 ; GFX12-NEXT: .LBB56_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8442,7 +8399,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: .LBB57_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8597,7 +8553,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB58_2 ; GFX12-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8744,7 +8699,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB59_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB59_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9163,7 +9117,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB62_2 ; GFX12-NEXT: .LBB62_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9304,7 +9257,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: .LBB63_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB63_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9439,7 +9391,6 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB64_2 ; GFX12-NEXT: .LBB64_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9566,7 +9517,6 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB65_2 ; GFX12-NEXT: .LBB65_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9693,7 +9643,6 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB66_2 ; GFX12-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9828,7 +9777,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB67_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -9975,7 +9923,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_cbranch_execnz .LBB68_2 ; GFX12-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 
s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -10116,7 +10063,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB69_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB69_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -10513,7 +10459,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_cbranch_execnz .LBB72_2 ; GFX12-NEXT: .LBB72_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -10648,7 +10593,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB73_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB73_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -10789,7 +10733,6 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB74_2 ; GFX12-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -10930,7 +10873,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB75_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB75_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: 
scratch_load_b64 v[0:1], off, s0 @@ -11085,7 +11027,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB76_2 ; GFX12-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11232,7 +11173,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB77_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB77_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11651,7 +11591,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB80_2 ; GFX12-NEXT: .LBB80_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11792,7 +11731,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB81_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB81_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12355,7 +12293,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB90_2 ; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12507,7 +12444,6 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB91_2 ; GFX12-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12654,7 +12590,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: .LBB92_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB92_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12816,7 +12751,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB93_2 ; GFX12-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12977,7 +12911,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: .LBB94_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB94_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12-NEXT: s_cselect_b32 s2, s2, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 @@ -13425,7 +13358,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB97_2 ; GFX12-NEXT: .LBB97_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -13580,7 +13512,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: .LBB98_2: ; GFX12-NEXT: 
; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB98_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12-NEXT: s_cselect_b32 s2, s2, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 @@ -14138,7 +14069,6 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB107_2 ; GFX12-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14146,6 +14076,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14288,7 +14219,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB108_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14296,6 +14226,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -14452,7 +14383,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 
%in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB109_2 ; GFX12-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14460,6 +14390,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14608,7 +14539,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB110_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB110_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14616,6 +14546,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -14762,6 +14693,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, 
v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14905,6 +14837,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -15054,7 +14987,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB113_2 ; GFX12-NEXT: .LBB113_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -15062,6 +14994,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -15204,7 +15137,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB114_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB114_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -15212,6 +15144,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], 
v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -15361,7 +15294,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB115_2 ; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -15369,11 +15301,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -15524,7 +15456,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB116_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 @@ -15532,11 +15463,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -15699,7 +15630,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB117_2 ; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -15707,11 +15637,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -15868,7 +15798,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB118_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: 
scratch_load_b64 v[0:1], off, s6 @@ -15876,11 +15805,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -16033,11 +15962,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -16189,11 +16118,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -16349,7 +16278,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB121_2 ; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -16357,11 +16285,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -16512,7 +16440,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB122_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 @@ -16520,11 +16447,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, 
v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 822d40f7349b0..52a23690dcf53 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -250,6 +250,7 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -400,9 +401,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0 ; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index f0fa621e3b4bc..c26e2911ab3ea 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -80,6 +80,7 @@ define amdgpu_ps i32 
@s_fmaximum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX940-LABEL: s_fmaximum3_f32: @@ -1307,6 +1308,7 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fmaximum3_f16: @@ -3005,6 +3007,7 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fmaximum3_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 7a8a224c76a83..234a07849a911 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -80,6 +80,7 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_minimum3_f32 v0, s0, s1, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX940-LABEL: s_fminimum3_f32: @@ -1307,6 +1308,7 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fminimum3_f16: @@ -3005,6 +3007,7 @@ 
define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fminimum3_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 6fc2029724640..fcaf427f6c010 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -115,6 +115,7 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures( ; GFX12-NEXT: v_fract_f32_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off @@ -301,10 +302,11 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_floor_f32_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3 ; GFX12-NEXT: global_store_b32 v[1:2], v3, off ; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -1423,6 +1425,7 @@ define float @wrong_commuted_nan_select_f32(float %x) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1 ; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, 
vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2118,10 +2121,11 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write ; GFX12-NEXT: v_fract_f32_e32 v7, v1 ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: v_floor_f32_e32 v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 ; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 ; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -2247,6 +2251,7 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] ; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off @@ -2383,6 +2388,7 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX12-NEXT: v_fract_f16_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX12-NEXT: v_floor_f16_e32 v4, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b16 v[1:2], v4, off @@ -2564,14 +2570,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: v_fract_f16_e32 v6, v0 ; GFX12-NEXT: v_floor_f16_e32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_fract_f16_e32 v4, v3 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 ; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; 
GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 @@ -2733,6 +2741,7 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri ; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204 ; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 2be6bf302d35f..73b4428b03c81 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -6936,7 +6936,6 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7158,7 +7157,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7381,7 +7379,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7610,7 +7607,6 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7814,7 +7810,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8021,7 +8016,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8256,7 +8250,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8561,6 +8554,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8596,7 +8590,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8912,6 +8905,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8947,7 +8941,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9296,7 +9289,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9591,6 +9583,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9624,7 +9617,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9929,6 +9921,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9962,7 +9955,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10291,7 +10283,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10551,7 +10542,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10782,6 +10772,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10818,7 +10809,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11136,6 +11126,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -11170,7 +11161,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11502,8 +11492,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11523,7 +11514,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11873,6 +11863,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11896,8 +11887,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11917,7 +11909,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12279,6 +12270,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -12302,8 +12294,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -12323,7 +12316,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12707,6 +12699,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12726,7 +12719,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13066,6 +13058,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13089,6 +13082,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ 
-13108,7 +13102,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13459,6 +13452,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13482,6 +13476,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13501,7 +13496,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13867,6 +13861,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -13885,7 +13880,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14188,8 +14182,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -14207,7 +14202,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14491,6 +14485,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -14514,8 +14509,9 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -14536,7 +14532,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14900,6 +14895,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -14923,6 +14919,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 @@ -14943,7 +14940,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23072,7 +23068,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e2fde562d36b1..cd6ed1e6b98c2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -3013,7 +3013,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3186,7 +3185,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3360,7 +3358,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3533,7 +3530,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3696,7 +3692,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3862,7 +3857,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4029,7 +4023,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4278,7 +4271,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4469,7 +4461,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4786,6 +4777,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -4823,7 +4815,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5151,6 +5142,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5188,7 +5180,6 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5552,7 +5543,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5860,6 +5850,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5896,7 +5887,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6214,6 +6204,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) @@ -6250,7 +6241,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6594,7 +6584,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6869,7 +6858,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7113,6 +7101,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7151,7 +7140,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7481,6 +7469,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7518,7 +7507,6 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7863,8 +7851,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -7884,7 +7873,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8236,6 +8224,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8259,8 +8248,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8280,7 +8270,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8644,6 +8633,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8667,8 +8657,9 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8688,7 +8679,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9074,6 +9064,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9093,7 +9084,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9435,6 +9425,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9458,6 +9449,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9477,7 +9469,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9830,6 +9821,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9853,6 +9845,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 @@ -9872,7 +9865,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10240,6 +10232,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10258,7 +10251,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,8 +10555,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10582,7 +10575,6 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10868,6 +10860,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10891,8 +10884,9 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10913,7 +10907,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11279,6 +11272,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11302,6 +11296,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11322,7 +11317,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11703,7 +11697,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11996,7 +11989,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12291,7 +12283,6 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12589,7 +12580,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12871,7 +12861,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13156,7 +13145,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13451,7 +13439,6 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13748,7 +13735,6 @@ define void 
@global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14039,9 +14025,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14058,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14438,9 +14424,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, 
v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14457,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14839,9 +14825,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14858,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15243,8 +15229,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; 
GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15261,7 +15249,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15629,8 +15616,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15647,7 +15636,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16018,8 +16006,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16036,7 +16026,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16416,9 +16405,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16436,7 +16426,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16819,8 +16808,10 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16838,7 +16829,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 903e80b15814f..b49047c54d7dd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -3013,7 +3013,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3186,7 +3185,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3360,7 +3358,6 @@ define double 
@global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3533,7 +3530,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3696,7 +3692,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3862,7 +3857,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4029,7 +4023,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4278,7 +4271,6 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4469,7 +4461,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4786,6 +4777,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -4823,7 +4815,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5151,6 +5142,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5188,7 +5180,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5552,7 +5543,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5860,6 +5850,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5896,7 +5887,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6214,6 +6204,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: 
v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6250,7 +6241,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6594,7 +6584,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6869,7 +6858,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7113,6 +7101,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7151,7 +7140,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7481,6 +7469,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7518,7 +7507,6 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7863,8 +7851,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -7884,7 +7873,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8236,6 +8224,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8259,8 +8248,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8280,7 +8270,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8644,6 +8633,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) @@ -8667,8 +8657,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8688,7 +8679,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9074,6 +9064,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9093,7 +9084,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9435,6 +9425,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9458,6 +9449,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9477,7 +9469,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9830,6 +9821,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9853,6 +9845,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9872,7 +9865,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10240,6 +10232,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10258,7 +10251,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,8 +10555,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10582,7 +10575,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10868,6 +10860,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10891,8 +10884,9 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10913,7 +10907,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11279,6 +11272,7 @@ 
define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11302,6 +11296,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11322,7 +11317,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11703,7 +11697,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11996,7 +11989,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12291,7 +12283,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12589,7 +12580,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12871,7 +12861,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13156,7 +13145,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13451,7 +13439,6 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13748,7 +13735,6 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14039,9 +14025,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14058,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14438,9 +14424,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14457,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14839,9 +14825,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14858,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15243,8 +15229,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15261,7 +15249,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15629,8 +15616,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15647,7 +15636,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16018,8 +16006,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16036,7 +16026,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16416,9 +16405,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16436,7 +16426,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16819,8 +16808,10 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 
0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16838,7 +16829,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 24791b60bfc6d..5577029f502d0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -42,7 +42,6 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32: @@ -274,7 +273,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -508,7 +506,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -750,7 +747,6 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32: @@ -971,7 +967,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1195,7 +1190,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1430,7 +1424,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1665,7 +1658,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1897,7 +1889,6 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz: @@ -2129,7 +2120,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2363,7 +2353,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2605,7 +2594,6 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2826,7 +2814,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3050,7 +3037,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3285,7 +3271,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3520,7 +3505,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3752,7 +3736,6 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64: @@ -4004,7 +3987,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4257,7 +4239,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4516,7 +4497,6 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4746,7 +4726,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4979,7 +4958,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5240,7 +5218,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5545,6 +5522,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -5580,7 +5558,6 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5896,6 +5873,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -5931,7 +5909,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6280,7 +6257,6 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6575,6 +6551,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6608,7 +6585,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr 
addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6913,6 +6889,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6946,7 +6923,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7275,7 +7251,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7535,7 +7510,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7766,6 +7740,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 
v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7802,7 +7777,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8120,6 +8094,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8154,7 +8129,6 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8486,8 +8460,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8507,7 +8482,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -8857,6 +8831,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8880,8 +8855,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8901,7 +8877,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9263,6 +9238,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9286,8 +9262,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9307,7 +9284,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9691,6 +9667,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9710,7 +9687,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -10050,6 +10026,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10073,6 +10050,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10092,7 +10070,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10443,6 +10420,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10466,6 +10444,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_or_b32_e32 v8, 
0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10485,7 +10464,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10851,6 +10829,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10869,7 +10848,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -11172,8 +11150,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11191,7 +11170,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11475,6 +11453,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11498,8 +11477,9 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11520,7 +11500,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11884,6 +11863,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11907,6 +11887,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11927,7 +11908,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12304,7 +12284,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -12582,7 +12561,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12862,7 +12840,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -13144,7 +13121,6 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13409,7 +13385,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13677,7 +13652,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13956,7 +13930,6 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14237,7 +14210,6 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz 
.LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14514,9 +14486,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14533,7 +14506,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -14913,9 +14885,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14932,7 +14905,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -15314,9 +15286,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -15333,7 +15306,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15718,8 +15690,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15736,7 +15710,6 @@ define void 
@global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -16104,8 +16077,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16122,7 +16097,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16493,8 +16467,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 
v2, v6, v2, 0x7060302 @@ -16511,7 +16487,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16891,9 +16866,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16911,7 +16887,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17294,8 +17269,10 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -17313,7 +17290,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index bfd57aebad521..a33d363e11bcf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -537,6 +537,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -664,6 +665,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -779,6 +781,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -891,6 
+894,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1445,6 +1449,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1572,6 +1577,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1687,6 +1693,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1799,6 +1806,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb 
scope:SCOPE_SYS @@ -2353,6 +2361,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2480,6 +2489,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2595,6 +2605,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2707,6 +2718,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3261,6 +3273,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, 
v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3388,6 +3401,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3503,6 +3517,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3615,6 +3630,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index b2f113f08a916..492a30b67089c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -1679,6 +1679,7 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1745,6 +1746,7 @@ define amdgpu_ps 
float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1931,6 +1933,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32: @@ -1943,6 +1946,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -1986,6 +1990,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: @@ -1998,6 +2003,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; 
GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -2065,6 +2071,7 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing: @@ -2081,6 +2088,7 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -2246,6 +2254,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: @@ -2262,6 +2271,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{} %zext.offset = zext i32 %voffset to i64 @@ -4727,10 +4737,12 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc ; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4845,10 +4857,12 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc ; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 3e6143866bf88..a15bf7f32dc27 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -46,19 +46,15 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b64 s[10:11], s[6:7] ; 
GFX12-NEXT: s_getpc_b64 s[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24 +; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16 ; GFX12-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], 0 ; GFX12-NEXT: s_getpc_b64 s[12:13] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s13, s13 -; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+24 +; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+16 ; GFX12-NEXT: s_load_u8 s14, s[4:5], 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: s_load_b64 s[6:7], s[12:13], 0x0 @@ -67,13 +63,12 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s12, 1, s14 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s12, 1 ; GFX12-NEXT: s_cselect_b32 s13, s7, s5 ; GFX12-NEXT: s_cselect_b32 s12, s6, s4 ; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index fbeda72725b2a..5256cbcef123a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -289,7 +289,6 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result @@ -591,7 +590,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe ; GFX12-NEXT: s_wait_alu 0xfffe @@ -802,7 +800,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s0, s0, 5 @@ -1050,7 +1047,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 @@ -1060,12 +1056,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1236,7 +1232,6 @@ define amdgpu_kernel void @add_i32_constant(ptr 
addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 @@ -1245,7 +1240,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index edf900a50cd4b..c0de009e935e6 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5123,6 +5123,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3 @@ -5131,9 +5132,10 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: 
v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 @@ -5149,6 +5151,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1] ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-LABEL: clpeak_imad_pat_i64: @@ -5159,6 +5162,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 @@ -5170,6 +5174,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v7, v0 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] @@ -5179,10 +5184,12 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v3, v4 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v6, v0 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; 
GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v7, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v3, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] @@ -5193,6 +5200,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3] ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add i64 %x, 1 @@ -5969,8 +5977,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4 @@ -5983,11 +5993,12 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12 ; GFX1200-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 ; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10 +; 
GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v13, v11, vcc_lo -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v10, v3, v4 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v11, v1, v5 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v8, v7 @@ -6019,6 +6030,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v9, v1, v4 ; GFX1200-SDAG-NEXT: v_add3_u32 v3, v10, v3, v6 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i64: @@ -6029,10 +6041,12 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 @@ -6043,9 +6057,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v9, v4, v[2:3] ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v14, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v11, v6, v[0:1] ; GFX1200-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v12, v8 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v14, v9, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v13, v10 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) @@ -6054,12 +6069,13 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v15, v3, v4 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v8, v11, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v12, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v3, v5, v[0:1] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v14, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v15, v12 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -6072,16 +6088,18 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v16, v9 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v16, v9 ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v11, v3 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v8, v15, v12 ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v15, 1 +; 
GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v16, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v8, v10 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v16, v7, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v14, v15 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v12, v[4:5] ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v8, v13, v[1:2] @@ -6093,6 +6111,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v8, v10 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i64> %x, @@ -9354,7 +9373,9 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) { ; GFX1200-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_setpc_b64 s[30:31] %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y) %mul.zext = zext i32 %mul to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index a7cc4cfc6707f..893b9fa6fb40d 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -143,6 +143,7 @@ ; GCN-O0-NEXT: Insert required mode 
register values ; GCN-O0-NEXT: SI Final Branch Preparation ; GCN-O0-NEXT: Post RA hazard recognizer +; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O0-NEXT: Branch relaxation pass ; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Register Usage Information Collector Pass @@ -426,6 +427,7 @@ ; GCN-O1-NEXT: SI Final Branch Preparation ; GCN-O1-NEXT: SI peephole optimizations ; GCN-O1-NEXT: Post RA hazard recognizer +; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass ; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog @@ -738,6 +740,7 @@ ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation ; GCN-O1-OPTS-NEXT: SI peephole optimizations ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer +; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass ; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog @@ -1056,6 +1059,7 @@ ; GCN-O2-NEXT: SI Final Branch Preparation ; GCN-O2-NEXT: SI peephole optimizations ; GCN-O2-NEXT: Post RA hazard recognizer +; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass ; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog @@ -1387,6 +1391,7 @@ ; GCN-O3-NEXT: SI Final Branch Preparation ; GCN-O3-NEXT: SI peephole optimizations ; GCN-O3-NEXT: Post RA hazard recognizer +; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass ; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index f86c9365d0b79..6fbd5ff80b5cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -64,7 +64,6 @@ define 
float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -84,7 +83,6 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -102,7 +100,6 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -172,7 +169,6 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -192,7 +188,6 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -210,7 +205,6 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 353f4d90cad1f..990a6066adcd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -26,7 +26,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: basic: @@ -50,7 +49,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 ; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; DAGISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: basic: @@ -123,8 +121,9 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -132,7 +131,6 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: wwm_in_shader: @@ -159,7 +157,6 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: wwm_in_shader: @@ -244,8 +241,9 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -253,7 +251,6 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: phi_whole_struct: @@ -279,7 +276,6 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b32 
exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: phi_whole_struct: @@ -367,8 +363,9 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: s_or_saveexec_b32 s8, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v1, s8 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s9 ; GISEL12-NEXT: s_mov_b32 exec_lo, s8 ; GISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 @@ -406,7 +403,6 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: ; %bb.8: ; %tail.end ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: control_flow: @@ -465,7 +461,6 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: control_flow: @@ -619,8 +614,9 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v13, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -632,7 +628,6 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: use_v0_7: @@ -663,7 +658,6 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: use_v0_7: @@ -786,7 +780,6 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37 ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 ; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 @@ -861,7 +854,6 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; DAGISEL12-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37 ; DAGISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; DAGISEL12-NEXT: s_wait_kmcnt 0x0 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 ; DAGISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index 1b1c89d9f5ad2..e0a5d397bded4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -26,8 +26,9 @@ define 
amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s12 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v1, s13 @@ -40,7 +41,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL12-NEXT: s_mov_b64 exec, s[4:5] -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[8:9] ; ; DAGISEL12-LABEL: basic: @@ -71,7 +71,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9] ; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7] -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[4:5] ; ; GISEL10-LABEL: basic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index e592a4ac5e8fa..8af5db9f62908 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -352,9 +352,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-SDAG-NEXT: 
v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] @@ -391,6 +392,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] @@ -503,9 +505,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] @@ -537,6 +540,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 5eb6d203098ee..076cf09678b57 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -840,6 +840,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -850,12 +851,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1015,6 +1018,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,12 +1029,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: 
v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1167,9 +1173,10 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1316,9 +1323,10 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-SDAG-NEXT: s_endpgm @@ -1468,9 +1476,10 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1655,9 +1664,10 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3395,6 +3405,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3405,12 +3416,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: 
v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3497,6 +3510,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3507,12 +3521,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3722,9 +3738,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3799,9 +3816,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -4023,9 +4041,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -4104,9 +4123,10 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -8550,6 +8570,7 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8588,6 +8609,7 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8626,6 +8648,7 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8664,6 +8687,7 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; 
GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8702,6 +8726,7 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8740,6 +8765,7 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8778,6 +8804,7 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -8816,6 +8843,7 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -8877,6 +8905,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -8892,6 +8921,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -8954,6 +8984,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -8969,6 +9000,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9057,6 +9089,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 @@ -9079,6 +9112,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -9174,6 +9208,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 @@ -9196,6 +9231,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9273,6 +9309,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9290,6 +9327,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ 
-9362,6 +9400,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -9379,6 +9418,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll index bb42834221681..10c000095fe3d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll @@ -34,6 +34,7 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -75,6 +76,7 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 % ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -127,6 +129,7 @@ define 
void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 @@ -185,6 +188,7 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 @@ -230,6 +234,7 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -268,6 +273,7 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -310,6 +316,7 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: 
v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -354,6 +361,7 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -394,6 +402,7 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -432,6 +441,7 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -474,6 +484,7 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -518,6 +529,7 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; 
GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -558,6 +570,7 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -596,6 +609,7 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -638,6 +652,7 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -682,6 +697,7 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 22a473e44b273..08d2201036c77 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -78,15 +78,14 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -95,13 +94,11 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr6 ; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 0522d5258b9b5..fed7a8ec105fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -122,6 +122,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 2698ce1dc3fe3..a2b9c869c9c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -47,15 +47,14 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; 
GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -64,13 +63,11 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x bfloat> %ret @@ -91,15 +88,14 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -108,12 +104,10 @@ define void 
@struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 6e94d4fe9fa27..91217c219c451 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -271,15 +271,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 
0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -288,12 +287,10 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -413,15 +410,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -430,12 +426,10 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe 
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 9d8572493b456..80fd1e05477f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -216,15 +216,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -233,13 +232,11 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: 
$vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret @@ -329,15 +326,14 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -346,13 +342,11 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; 
GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index 6da16f0a3b053..13bb72a96142f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -455,10 +455,10 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -467,13 +467,11 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 
%voffset.add, i32 %soffset, i32 0) @@ -613,15 +611,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -630,13 +627,11 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index aa41ef024d6e0..e75dd7409d51b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -455,10 +455,10 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -467,13 +467,11 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -613,15 +611,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -630,13 +627,11 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index d46622ef45f43..94c2e518a9fd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -115,17 +115,22 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, 
vcc_lo, 0, v7, vcc_lo ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -296,32 +301,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v12, v1 ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo ; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 
0, v9, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -771,6 +784,7 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) { ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -843,6 +857,7 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v4 
:: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 223870950e4b7..fbda0e71a74c6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -6026,7 +6026,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 @@ -8138,12 +8137,11 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10014 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10015 ; GFX12-NEXT: s_lshr_b32 s4, s3, 31 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 @@ -8231,8 +8229,8 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10001 ; GFX12-NEXT: s_and_b32 s3, s3, 1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 31 ; GFX12-NEXT: s_bfe_u32 s4, s2, 
0x1001e @@ -9475,14 +9473,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s94, s13, 29 ; GFX12-NEXT: s_lshr_b32 s78, s13, 26 ; GFX12-NEXT: s_lshr_b32 s88, s13, 27 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 ; GFX12-NEXT: s_lshr_b32 s66, s13, 24 ; GFX12-NEXT: s_lshr_b32 s74, s13, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 ; GFX12-NEXT: s_lshr_b32 s56, s13, 22 ; GFX12-NEXT: s_lshr_b32 s62, s13, 23 @@ -9499,21 +9495,18 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s2, s13, 14 ; GFX12-NEXT: s_lshr_b32 s4, s13, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX12-NEXT: s_lshr_b32 s6, s13, 12 ; GFX12-NEXT: s_lshr_b32 s8, s13, 13 ; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX12-NEXT: s_lshr_b32 s10, s13, 10 ; GFX12-NEXT: s_lshr_b32 s14, s13, 11 ; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 @@ -9526,7 +9519,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s16, s13, 8 ; GFX12-NEXT: 
s_lshr_b32 s20, s13, 9 ; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 @@ -9658,6 +9650,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s44, s12, 4 ; GFX12-NEXT: s_lshr_b32 s30, s12, 2 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 255a1acbe0086..2afac4e90aa40 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8875,7 +8875,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61 ; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59 ; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57 ; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index efc31fbd5ed9e..b945c7c3def6a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8756,7 +8756,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 ; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 ; GFX12-NEXT: s_lshr_b32 s48, s5, 16 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index e4602f20f8a37..5fd6deff0fbbb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -505,7 +505,6 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64: @@ -697,7 +696,6 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64__offset: @@ -888,7 +886,6 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64: @@ -1071,7 +1068,6 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1274,7 +1270,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16: @@ -1581,7 +1576,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1895,7 +1889,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16: @@ -2190,7 +2183,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2485,7 +2477,6 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -2721,7 +2712,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2950,8 +2940,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: 
v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2971,7 +2962,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16: @@ -3305,8 +3295,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3326,7 +3317,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3667,8 +3657,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3688,7 +3679,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16: @@ -4012,6 +4002,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4031,7 +4022,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4356,6 +4346,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4374,7 +4365,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -4646,8 +4636,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4665,7 +4656,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -7030,7 +7020,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 @@ -7106,6 +7095,7 @@ define amdgpu_kernel void 
@local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo @@ -7897,7 +7887,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 @@ -7969,6 +7958,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index d419b0cdfdd1a..1e8072460c7a3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -815,7 +815,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16: @@ -1129,7 +1128,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1450,7 +1448,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16: @@ -1753,7 +1750,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2056,7 +2052,6 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2300,7 +2295,6 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2536,8 +2530,9 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2557,7 +2552,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16: @@ -2893,8 +2887,9 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2914,7 +2909,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3257,8 +3251,9 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3278,7 +3273,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16: @@ -3604,6 +3598,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3623,7 +3618,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3950,6 +3944,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3968,7 +3963,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) 
%ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -4242,8 +4236,9 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4261,7 +4256,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4542,7 +4536,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16: @@ -4814,7 +4807,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5085,7 +5077,6 @@ define void 
@local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16: @@ -5347,7 +5338,6 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5609,6 +5599,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 @@ -5617,6 +5608,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -5633,7 +5625,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16: @@ -5986,6 +5977,7 @@ define <2 x bfloat> 
@local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 @@ -5994,6 +5986,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6010,7 +6003,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -6368,8 +6360,10 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6386,7 +6380,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; 
GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6732,8 +6725,10 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6750,7 +6745,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 282947afa409a..7249b0b1fc0e3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -815,7 +815,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16: @@ -1129,7 +1128,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1450,7 +1448,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16: @@ -1753,7 +1750,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2056,7 +2052,6 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2300,7 +2295,6 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2536,8 +2530,9 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 
; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2557,7 +2552,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16: @@ -2893,8 +2887,9 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2914,7 +2909,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3257,8 +3251,9 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3278,7 +3273,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16: @@ -3604,6 +3598,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3623,7 +3618,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3950,6 +3944,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3968,7 +3963,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -4242,8 +4236,9 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4261,7 +4256,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4542,7 +4536,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16: @@ -4814,7 +4807,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5085,7 +5077,6 @@ define void @local_atomic_fmin_noret_v2f16(ptr 
addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16: @@ -5347,7 +5338,6 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5609,6 +5599,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 @@ -5617,6 +5608,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -5633,7 +5625,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16: @@ -5986,6 +5977,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 @@ -5994,6 +5986,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6010,7 +6003,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -6368,8 +6360,10 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6386,7 +6380,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6732,8 +6725,10 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6750,7 +6745,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 967e972e53e29..65e00c50292dc 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -42,7 +42,6 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32: @@ -254,7 +253,6 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__offset: @@ -465,7 +463,6 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32: @@ -666,7 +663,6 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__offset: @@ -875,7 +871,6 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64: @@ -1092,7 +1087,6 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1308,7 +1302,6 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64: @@ -1514,7 +1507,6 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1740,7 +1732,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16: @@ -2047,7 +2038,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2361,7 +2351,6 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16: @@ -2656,7 +2645,6 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2951,7 +2939,6 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3187,7 +3174,6 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: 
s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3416,8 +3402,9 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3437,7 +3424,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16: @@ -3771,8 +3757,9 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3792,7 +3779,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4133,8 +4119,9 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4154,7 +4141,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16: @@ -4478,6 +4464,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4497,7 +4484,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4822,6 +4808,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4840,7 +4827,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -5112,8 +5098,9 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5131,7 +5118,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5408,7 +5394,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16: @@ -5665,7 +5650,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset: @@ -5920,7 +5904,6 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16: @@ -6165,7 +6148,6 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6413,6 +6395,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 @@ -6421,6 +6404,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6437,7 +6421,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16: @@ -6790,6 +6773,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 @@ -6798,6 +6782,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6814,7 +6799,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -7172,8 +7156,10 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -7190,7 +7176,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7536,8 +7521,10 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -7554,7 +7541,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7907,7 +7893,6 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: @@ -8117,7 +8102,6 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index eaf8809d33fc3..e8744c7828d41 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -35,10 +35,10 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -47,7 +47,6 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: @@ -60,6 +59,7 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: 
v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -73,7 +73,6 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_3 ; GFX12-NEXT: ; %bb.4: @@ -117,10 +116,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v6 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -129,7 +128,6 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: @@ -142,6 +140,7 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] @@ -155,7 +154,6 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_3 ; GFX12-NEXT: ; %bb.4: @@ -200,10 +198,10 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -212,7 +210,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -225,6 +222,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], 
v[2:3] @@ -238,7 +236,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_3 ; GFX12-NEXT: ; %bb.4: @@ -281,10 +278,10 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -293,7 +290,6 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: @@ -306,6 +302,7 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -319,7 +316,6 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; 
implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_3 ; GFX12-NEXT: ; %bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 6bd0498a2a4e4..b6ff99214249a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -359,10 +359,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -371,7 +371,6 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-SDAG-NEXT: ; %bb.2: @@ -384,6 +383,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -397,7 +397,6 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; GFX12-SDAG-NEXT: ; %bb.4: @@ -426,7 +425,6 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 @@ -792,10 +790,10 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -804,7 +802,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: 
s_cbranch_execnz .LBB1_1 ; GFX12-SDAG-NEXT: ; %bb.2: @@ -817,6 +814,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -831,7 +829,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; GFX12-SDAG-NEXT: ; %bb.4: @@ -860,7 +857,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 382f1a8c3f431..989ef6f981d9d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -193,7 +193,6 @@ define amdgpu_kernel void @caller() { ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -207,7 +206,6 @@ define amdgpu_kernel void @caller() { ; 
GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 7d18739fd0c32..11c62a7312755 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -381,20 +381,25 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10] ; GFX12-NEXT: v_mov_b32_e32 v10, v8 ; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i128 %sext1 = sext i32 %arg1 to i128 @@ -1161,19 +1166,22 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX12-NEXT: v_xor_b32_e32 v3, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -1249,12 +1257,14 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -1795,11 +1805,14 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = add i64 %arg0, 1 %lsh = lshr i64 %arg0, 32 diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 8e436b327cda1..5e74a380e0748 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -130,7 +130,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: @@ -337,7 +336,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc: @@ 
-773,7 +771,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: @@ -941,7 +938,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: @@ -1524,7 +1520,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: @@ -1734,7 +1729,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 7646197f13175..a2f7d24f9ec87 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -759,7 +759,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca 
[4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -1474,7 +1473,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 16, addrspace(5) @@ -2204,7 +2202,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index fa15a42aef2ac..64afe3cd01255 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -278,17 +278,20 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -296,19 +299,22 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) @@ -391,15 +397,19 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index ab485b1799470..a00af8e5b6582 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -26,29 +26,26 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-NEXT: s_mov_b32 s2, 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-NEXT: s_mov_b32 s2, 0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; 
GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_mov_b32 s2, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index ddc4673a290fe..df5b45dea0c2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -463,29 +463,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
v_add_co_u32 v0, s3, s3, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 @@ -499,29 +496,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 @@ -991,18 +985,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 @@ -1025,18 +1018,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index f10715033e433..e1f82a70b4c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -325,29 +325,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] -; 
GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 @@ -364,29 +361,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 
v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 @@ -733,18 +727,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 @@ -772,18 +765,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 465626078f6c6..5f952b98041f3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -28,7 +28,6 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -73,7 +72,6 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index f06118a7a6dc9..ebcc900307c46 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -416,7 +416,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -434,7 +433,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -832,7 +830,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -849,7 +846,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 2bf2e03cb0bd7..7dfd5e60c24f8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -309,7 +309,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -329,7 +328,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -647,7 +645,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -670,7 +667,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 3c485af18166f..e9be38d6d17a3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -396,11 +396,10 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -414,11 +413,10 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 -; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -774,7 +772,6 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -791,7 +788,6 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 9740e0ae1d167..9e5f5fcffca9f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -276,11 +276,10 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -294,11 +293,10 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -552,7 +550,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -574,7 +571,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index f988a4d33add9..bc905fa564f8a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -27,7 +27,6 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-NEXT: s_mov_b32 s3, 2 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 71f28efd47811..6feab49ed86b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -418,7 +418,6 @@ define amdgpu_kernel void 
@private_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -436,7 +435,6 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -818,7 +816,6 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -835,7 +832,6 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 3346a034f963f..f8fb7986938f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -304,7 +304,6 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ 
-324,7 +323,6 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -624,7 +622,6 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -647,7 +644,6 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll index 7059e80d5c3d1..db1399cc74dc6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s ; Effectively, check that the compile finishes; in the case diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index f5fb85d63b8e4..a32b3b71cd606 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -280,17 +280,20 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, 
v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -298,19 +301,22 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 
+; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) @@ -395,15 +401,19 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index b8be5b300bb7b..2c2058473e235 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ 
b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -620,9 +620,11 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max: @@ -651,9 +653,11 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 16777214 %load = load i8, ptr %gep, align 4 @@ -825,9 +829,11 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max: @@ -838,9 +844,11 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -16777215 %load = load i8, ptr %gep, align 4 @@ -884,9 +892,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0: @@ -915,9 +925,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936639 %load = load i8, ptr %gep, align 4 @@ -961,9 +973,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1: @@ -992,9 +1006,11 @@ define i8 
@flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936640 %load = load i8, ptr %gep, align 4 @@ -1038,9 +1054,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0: @@ -1069,9 +1087,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938687 %load = load i8, ptr %gep, align 4 @@ -1115,9 +1135,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: 
flat_load_u8 v0, v[0:1] offset:4096 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1: @@ -1128,9 +1150,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938688 %load = load i8, ptr %gep, align 4 @@ -1174,9 +1198,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0: @@ -1205,9 +1231,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942783 %load = load i8, ptr %gep, align 4 @@ -1251,9 +1279,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr 
%p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1: @@ -1264,9 +1294,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942784 %load = load i8, ptr %gep, align 4 @@ -1311,9 +1343,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1334,9 +1368,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: 
flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 %load = load i8, ptr %gep, align 4 @@ -1381,9 +1417,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1404,9 +1442,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 %load = load i8, ptr %gep, align 4 @@ -1451,9 +1491,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: 
flat_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1474,9 +1516,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 %load = load i8, ptr %gep, align 4 @@ -1521,9 +1565,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1544,9 +1590,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 %load = load i8, ptr %gep, align 4 @@ -1591,9 +1639,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1614,9 +1664,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 %load = load i8, ptr %gep, align 4 @@ -1661,9 +1713,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1684,9 +1738,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: 
flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 %load = load i8, ptr %gep, align 4 @@ -2657,7 +2713,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2752,7 +2808,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2847,7 +2903,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2943,7 +2999,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3039,7 +3095,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3135,7 +3191,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index fd62ba3f9da1f..d16d731c34384 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -659,9 +659,11 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: @@ -699,9 +701,11 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -869,9 +873,11 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: @@ -909,9 +915,11 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -956,9 +964,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0: @@ -996,9 +1006,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1042,9 +1054,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: @@ -1073,9 +1087,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1119,9 +1135,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: @@ -1159,9 +1177,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1205,9 +1225,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: @@ -1218,9 +1240,11 
@@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1264,9 +1288,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: @@ -1304,9 +1330,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1350,9 +1378,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 
0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: @@ -1363,9 +1393,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1410,9 +1442,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1451,9 +1485,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561 ; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1498,9 +1534,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1530,9 +1568,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1577,9 +1617,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: 
global_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1618,9 +1660,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1665,9 +1709,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1688,9 +1734,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1735,9 +1783,11 @@ define i8 
@global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1776,9 +1826,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1823,9 +1875,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1846,9 +1900,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: 
v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -2655,7 +2711,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2748,7 +2804,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2841,7 +2897,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2934,7 +2990,7 @@ define amdgpu_kernel 
void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -3027,7 +3083,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -3120,7 +3176,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index a439f8df10a26..1e6106896e0a4 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -11,6 +11,7 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) { ; GFX12-NEXT: s_add_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 ; GFX12-NEXT: v_s_exp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_ldexp_f32 v0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog @@ -22,6 +23,7 @@ define amdgpu_cs half @v_s_exp_f16(half inreg %src) { ; GFX12-LABEL: v_s_exp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -33,6 +35,7 @@ define amdgpu_cs float @v_s_amdgcn_exp_f32(float inreg %src) { ; GFX12-LABEL: v_s_amdgcn_exp_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -44,6 +47,7 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) { ; GFX12-LABEL: v_s_amdgcn_exp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -59,9 +63,8 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -92,6 +95,7 @@ define amdgpu_cs half @v_s_log_f16(half inreg %src) { ; GFX12-LABEL: v_s_log_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -103,6 +107,7 @@ define amdgpu_cs float @v_s_amdgcn_log_f32(float inreg %src) { ; GFX12-LABEL: v_s_amdgcn_log_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -114,6 +119,7 @@ define amdgpu_cs half @v_s_amdgcn_log_f16(half inreg %src) { ; GFX12-LABEL: v_s_amdgcn_log_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -125,6 +131,7 @@ define amdgpu_cs float @v_s_rcp_f32(float inreg %src) { ; GFX12-LABEL: v_s_rcp_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rcp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -136,6 +143,7 @@ define amdgpu_cs half @v_s_rcp_f16(half inreg %src) { ; GFX12-LABEL: v_s_rcp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rcp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -148,6 +156,7 @@ define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-SDAG-LABEL: v_s_rsq_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_s_rsq_f32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -155,8 +164,10 @@ define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-GISEL-LABEL: v_s_rsq_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s0, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; 
GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-GISEL-NEXT: v_s_rcp_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %sqrt = call fast float @llvm.sqrt.f32(float %src) @@ -168,6 +179,7 @@ define amdgpu_cs half @v_s_rsq_f16(half inreg %src) { ; GFX12-LABEL: v_s_rsq_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rsq_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -185,7 +197,6 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0 ; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_mov_b32 s4, s1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1 @@ -223,7 +234,6 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0 ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_mov_b32 s4, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -241,11 +251,12 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %src) @@ -256,6 +267,7 @@ define amdgpu_cs half @v_s_sqrt_f16(half inreg %src) { ; GFX12-LABEL: v_s_sqrt_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -267,6 +279,7 @@ define amdgpu_cs float @v_amdgcn_sqrt_f32(float inreg %src) { ; GFX12-LABEL: v_amdgcn_sqrt_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -278,6 +291,7 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) { ; GFX12-LABEL: v_amdgcn_sqrt_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -296,7 +310,6 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) @@ -333,9 +346,8 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -368,6 +380,7 @@ define amdgpu_cs half @srcmods_abs_f16(half inreg %src) { ; GFX12-LABEL: srcmods_abs_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, |s0| +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -380,6 +393,7 @@ define amdgpu_cs half @srcmods_neg_f16(half inreg %src) { ; GFX12-LABEL: srcmods_neg_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, -s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 83a077f7f74db..1dcc6a19c29d7 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -20,7 +20,6 @@ define void @func1() { ; GFX12-SDAG-NEXT: s_mov_b32 m0, 3 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: func1: @@ -35,7 +34,6 @@ define void @func1() { ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 3 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) call void 
@llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) @@ -57,7 +55,6 @@ define void @func2() { ; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: func2: @@ -72,7 +69,6 @@ define void @func2() { ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 1 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) @@ -94,11 +90,10 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX12-SDAG-NEXT: s_barrier_init m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 @@ -117,16 +112,13 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+12 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16 ; GFX12-SDAG-NEXT: s_barrier_signal -1 ; 
GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_barrier_wait -1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -136,7 +128,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24 ; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1 ; GFX12-SDAG-NEXT: s_endpgm @@ -154,11 +145,10 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_mov_b32 m0, s1 ; GFX12-GISEL-NEXT: s_barrier_init m0 ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002 @@ -178,16 +168,13 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16 ; 
GFX12-GISEL-NEXT: s_barrier_signal -1 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-GISEL-NEXT: s_barrier_wait -1 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 @@ -199,7 +186,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1 ; GFX12-GISEL-NEXT: s_endpgm @@ -226,11 +212,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+12 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002 @@ -243,7 +227,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -252,11 +235,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, 
s5, 0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0 ; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] @@ -268,7 +249,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 2 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7) diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index b7aecca45def5..dc91a55e8f3c9 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -87,7 +87,6 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 50a3336a7483c..9a168c133c552 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -30,6 +30,7 @@ define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; 
GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -59,6 +60,7 @@ define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -88,6 +90,7 @@ define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -144,6 +147,7 @@ define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge float %a, %b @@ -173,6 +177,7 @@ define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge float %a, %b @@ -202,6 +207,7 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge float %a, %b @@ -261,8 +267,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float ; 
GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -295,8 +303,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -329,8 +339,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -363,8 +375,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, < ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -397,8 +411,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -431,8 +447,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -465,8 +483,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -499,8 +519,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, < ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -534,6 +556,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -567,6 +590,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -600,6 +624,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -664,6 +689,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -697,6 +723,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -730,6 +757,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -806,11 +834,14 @@ define <2 x half> 
@v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -856,11 +887,14 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -906,11 +940,14 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, 
vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -992,11 +1029,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1042,11 +1082,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1092,11 +1135,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1197,14 +1243,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1272,14 +1322,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1347,14 +1401,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1470,14 +1528,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; 
GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1545,14 +1607,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1620,14 
+1686,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 80ccd1ffe0294..001c35ef30cc6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -858,6 +858,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm @@ -990,10 +991,13 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, 
v11, v15, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v9, v13, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index bad0be16e75cc..79ec4b8831679 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -69,7 +69,6 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: swap: @@ -94,7 +93,6 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index d91ee54215924..1eabe62e7710e 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 
-verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s --- | @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ] diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir new file mode 100644 index 0000000000000..fe3cb30793390 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir @@ -0,0 +1,347 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_gs void @hazard_disable() #0 { ret void } + define amdgpu_gs void @hazard_enable() #1 { ret void } + define amdgpu_cs void @hazard_calls() #2 { ret void } + define void @hazard_callee1() #2 { ret void } + define void @hazard_callee2() #2 { ret void } + define amdgpu_cs void @hazard_cull_vmem() #3 { ret void } + define amdgpu_cs void @hazard_cull_vmem2() #4 { ret void } + define amdgpu_cs void @hazard_cull_sample() #3 { ret void } + define amdgpu_cs void @hazard_cull_bvh() #3 { ret void } + define amdgpu_cs void @hazard_nocull_scratch() #3 { ret void } + define amdgpu_cs void @hazard_cull_global() #3 { ret void } + define amdgpu_cs void @hazard_nocull_flat() #3 { ret void } + + attributes #0 = { "amdgpu-sgpr-hazard-wait"="0" } + attributes #1 = { "amdgpu-sgpr-hazard-wait"="1" } + attributes #2 = { "amdgpu-sgpr-hazard-boundary-cull" } + attributes #3 = { "amdgpu-sgpr-hazard-mem-wait-cull" "amdgpu-sgpr-hazard-mem-wait-cull-threshold"="1" } + attributes #4 = { "amdgpu-sgpr-hazard-mem-wait-cull" "amdgpu-sgpr-hazard-mem-wait-cull-threshold"="2" } +... 
+ +--- +name: hazard_disable +body: | + bb.0: + ; GCN-LABEL: name: hazard_disable + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_enable +body: | + bb.0: + ; GCN-LABEL: name: hazard_enable + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_calls +frameInfo: + hasCalls: true +body: | + ; GCN-LABEL: name: hazard_calls + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, 
implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: + ; GCN-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + S_CBRANCH_SCC0 %bb.3, implicit $scc + S_BRANCH %bb.4 + + bb.2: + $sgpr16 = S_MOV_B32 0 + S_SETPC_B64 $sgpr0_sgpr1 + + bb.3: + $sgpr18 = S_MOV_B32 0 + S_SETPC_B64_return $sgpr0_sgpr1 + + bb.4: + $vcc_lo = S_MOV_B32 0 + $sgpr20 = S_MOV_B32 0 + $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + + bb.5: + $sgpr8_sgpr9 = 
S_CALL_B64 0 + + bb.6: + $sgpr22 = S_MOV_B32 $sgpr8 + S_ENDPGM 0 +... + +--- +name: hazard_callee1 +body: | + bb.0: + ; GCN-LABEL: name: hazard_callee1 + ; GCN: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... + +--- +name: hazard_callee2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_callee2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... 
+ +--- +name: hazard_cull_vmem +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_vmem + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_vmem2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_vmem2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr2, 0, implicit $exec + ; GCN-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr2, 
0, implicit $exec + $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_sample +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_sample + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + S_WAIT_SAMPLECNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_bvh +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_bvh + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + S_WAIT_BVHCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_nocull_scratch +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_scratch + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_global +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_global + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_nocull_flat +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_flat + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir index 2aa16dd904766..04f7e480764e6 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s -# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=0 -o - %s | FileCheck -check-prefixes=GCN,NOBC,NOMEMC %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=1 -o - %s | FileCheck -check-prefixes=GCN,BC,NOMEMC %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=0 -amdgpu-sgpr-hazard-mem-wait-cull=1 -amdgpu-sgpr-hazard-mem-wait-cull-threshold=1 -o - %s | FileCheck -check-prefixes=GCN,NOBC,MEMC %s --- | @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ] @@ -12,6 +13,7 @@ define amdgpu_gs void @hazard_vcc1() { ret void } define amdgpu_gs void @hazard_vcc2() { ret void } define amdgpu_gs void @hazard_vcc3() { ret void } + define amdgpu_gs void @hazard_merge_vcc() { ret void } define amdgpu_gs void @hazard_addc1() { ret void } define amdgpu_gs void @hazard_addc2() { ret void } define amdgpu_gs void @hazard_addc3() { ret void } @@ -28,26 +30,33 @@ define amdgpu_gs void @hazard_post_order2() { ret void } define amdgpu_gs void @hazard_post_order_cycle() { ret void } 
define amdgpu_cs void @hazard_calls() { ret void } + define void @hazard_callee1() { ret void } + define void @hazard_callee2() { ret void } + define amdgpu_cs void @hazard_carry_vcc() { ret void } + define amdgpu_cs void @hazard_carry_vcc_no_hazard() { ret void } + define amdgpu_cs void @hazard_carry_sgpr() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard1() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard2() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard3() { ret void } + define amdgpu_cs void @hazard_cull_vmem() { ret void } + define amdgpu_cs void @hazard_cull_sample() { ret void } + define amdgpu_cs void @hazard_cull_bvh() { ret void } + define amdgpu_cs void @hazard_nocull_scratch() { ret void } + define amdgpu_cs void @hazard_cull_global() { ret void } + define amdgpu_cs void @hazard_nocull_flat() { ret void } + define amdgpu_cs void @hazard_existing_cull() { ret void } ... --- name: hazard_getpc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc1 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc1 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec $sgpr0_sgpr1 = 
S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -58,20 +67,12 @@ body: | name: hazard_getpc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc2 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc2 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -82,27 +83,15 @@ body: | name: hazard_getpc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc3 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc - ; GCN-O0-NEXT: } - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc3 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; 
GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc - ; GCN-O2-NEXT: } - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -116,31 +105,17 @@ body: | name: hazard_getpc4 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc4 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc - ; GCN-O0-NEXT: } - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc4 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O2-NEXT: $sgpr0_sgpr1 
= S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 - ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc - ; GCN-O2-NEXT: } - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -155,20 +130,12 @@ body: | name: hazard_vcc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc1 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec - ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc1 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec - ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-NEXT: $sgpr3 = 
S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -179,20 +146,12 @@ body: | name: hazard_vcc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc2 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc2 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc @@ -203,21 +162,56 @@ body: | name: hazard_vcc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc3 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc3 - ; GCN-O2: 
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_merge_vcc +body: | + ; GCN-LABEL: name: hazard_merge_vcc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: S_WAITCNT_DEPCTR 65532 + ; GCN-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + bb.0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + S_CBRANCH_SCC0 %bb.1, implicit $scc + S_BRANCH %bb.2 + bb.1: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + 
S_BRANCH %bb.3 + bb.2: + $vcc_lo = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + S_BRANCH %bb.3 + bb.3: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec S_ENDPGM 0 ... @@ -226,20 +220,12 @@ body: | name: hazard_addc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -250,20 +236,12 @@ body: | name: hazard_addc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc2 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc2 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, 
implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc2 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -274,20 +252,12 @@ body: | name: hazard_addc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc3 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc3 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc3 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc @@ -298,20 +268,12 @@ body: | name: hazard_addc4 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc4 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr3 
= S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc4 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc4 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -322,26 +284,14 @@ body: | name: hazard_addc5 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc5 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc5 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc5 + ; GCN: 
$vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr16 = S_MOV_B32 0 $sgpr32 = S_MOV_B32 0 @@ -354,35 +304,17 @@ body: | name: hazard_addc6 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc6 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc6 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc6 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-NEXT: 
$sgpr80 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr16 = S_MOV_B32 0 $sgpr32 = S_MOV_B32 0 @@ -398,19 +330,12 @@ body: | name: hazard_vaddc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vaddc1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vaddc1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vaddc1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec @@ -421,36 +346,20 @@ body: | name: hazard_gap1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: $sgpr1 
= S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec S_NOP 0 S_NOP 0 @@ -469,36 +378,20 @@ body: | name: hazard_gap2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap2 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap2 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 
0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap2 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc S_NOP 0 @@ -517,41 +410,19 @@ body: | name: hazard_gap3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap3 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; 
GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap3 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap3 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -569,49 +440,22 @@ body: | name: hazard_gap4_no_hazard body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap4_no_hazard - ; GCN-O0: $vgpr1, $vcc_lo = 
V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap4_no_hazard - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def 
$scc - ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap4_no_hazard + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -632,19 +476,11 @@ body: | name: hazard_valu_write1_no_hazard body: | bb.0: - ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard - ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard - ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, 
implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_valu_write1_no_hazard + ; GCN: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc @@ -655,19 +491,11 @@ body: | name: hazard_post_order1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_post_order1 - ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order1 - ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order1 + ; GCN: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec @@ -677,31 +505,17 @@ body: | --- name: hazard_post_order2 body: | - ; GCN-O0-LABEL: name: hazard_post_order2 - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: successors: %bb.1(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; 
GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_BRANCH %bb.1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order2 - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: successors: %bb.1(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_BRANCH %bb.1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order2 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 bb.0: $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -715,50 +529,27 @@ body: | --- name: hazard_post_order_cycle body: | - ; GCN-O0-LABEL: name: hazard_post_order_cycle - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: successors: %bb.1(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: successors: %bb.2(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.2: - ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; 
GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.3: - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order_cycle - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: successors: %bb.1(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: successors: %bb.2(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.2: - ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.3: - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order_cycle + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: S_ENDPGM 0 bb.0: S_NOP 0 @@ -779,84 +570,485 @@ name: hazard_calls frameInfo: hasCalls: true body: | - ; GCN-O0-LABEL: name: hazard_calls - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0 - ; GCN-O0-NEXT: 
S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.2: - ; GCN-O0-NEXT: successors: %bb.3(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.3: - ; GCN-O0-NEXT: successors: %bb.4(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.4: - ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 + ; NOBC-LABEL: name: hazard_calls + ; NOBC: bb.0: + ; NOBC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; NOBC-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; NOBC-NEXT: S_BRANCH %bb.1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.1: + ; NOBC-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; NOBC-NEXT: S_BRANCH %bb.4 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.2: + ; NOBC-NEXT: $sgpr16 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.3: + ; NOBC-NEXT: $sgpr18 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.4: + ; NOBC-NEXT: 
successors: %bb.5(0x80000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: $vcc_lo = S_MOV_B32 0 + ; NOBC-NEXT: $sgpr20 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.5: + ; NOBC-NEXT: successors: %bb.6(0x80000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.6: + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; NOBC-NEXT: S_ENDPGM 0 ; - ; GCN-O2-LABEL: name: hazard_calls - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.2: - ; GCN-O2-NEXT: successors: %bb.3(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.3: - ; GCN-O2-NEXT: successors: %bb.4(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.4: - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 - ; GCN-O2-NEXT: S_ENDPGM 0 + ; BC-LABEL: name: hazard_calls + ; BC: bb.0: + ; BC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; 
BC-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; BC-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; BC-NEXT: S_BRANCH %bb.1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.1: + ; BC-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; BC-NEXT: S_BRANCH %bb.4 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.2: + ; BC-NEXT: $sgpr16 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.3: + ; BC-NEXT: $sgpr18 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.4: + ; BC-NEXT: successors: %bb.5(0x80000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $vcc_lo = S_MOV_B32 0 + ; BC-NEXT: $sgpr20 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; BC-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.5: + ; BC-NEXT: successors: %bb.6(0x80000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.6: + ; BC-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; BC-NEXT: S_ENDPGM 0 bb.0: + $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + 
$vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + S_CBRANCH_SCC0 %bb.3, implicit $scc + S_BRANCH %bb.4 + + bb.2: $sgpr16 = S_MOV_B32 0 S_SETPC_B64 $sgpr0_sgpr1 - bb.1: + bb.3: $sgpr18 = S_MOV_B32 0 S_SETPC_B64_return $sgpr0_sgpr1 - bb.2: + bb.4: + $vcc_lo = S_MOV_B32 0 $sgpr20 = S_MOV_B32 0 $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - bb.3: + bb.5: $sgpr8_sgpr9 = S_CALL_B64 0 - bb.4: + bb.6: $sgpr22 = S_MOV_B32 $sgpr8 S_ENDPGM 0 ... + +--- +name: hazard_callee1 +body: | + bb.0: + ; NOBC-LABEL: name: hazard_callee1 + ; NOBC: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + ; + ; BC-LABEL: name: hazard_callee1 + ; BC: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; BC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; BC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... 
+ +--- +name: hazard_callee2 +body: | + bb.0: + ; NOBC-LABEL: name: hazard_callee2 + ; NOBC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOBC-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + ; + ; BC-LABEL: name: hazard_callee2 + ; BC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; BC-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... + +--- +name: hazard_carry_vcc +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_vcc + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65533 + ; GCN-NEXT: $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc, implicit $vcc, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_vcc_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_vcc_no_hazard + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec + ; GCN-NEXT: $sgpr8 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc, implicit $exec + $sgpr8 = S_MOV_B32 $vcc_lo + $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_carry_sgpr +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 61951 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_sgpr_no_hazard1 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard1 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr8 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $sgpr8 = S_MOV_B32 $sgpr0 + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_carry_sgpr_no_hazard2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_sgpr_no_hazard3 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard3 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr8 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $sgpr8 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0 + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_cull_vmem +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_vmem + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; NOMEMC-NEXT: S_WAIT_LOADCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_vmem + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_LOADCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + 
$sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_sample +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_sample + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; NOMEMC-NEXT: S_WAIT_SAMPLECNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_sample + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_SAMPLECNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + S_WAIT_SAMPLECNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_bvh +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_bvh + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; NOMEMC-NEXT: S_WAIT_BVHCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_bvh + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_BVHCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + S_WAIT_BVHCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_nocull_scratch +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_scratch + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_global +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_global + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; NOMEMC-NEXT: S_WAIT_LOADCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_global + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_LOADCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + 
S_ENDPGM 0 +... + +--- +name: hazard_nocull_flat +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_flat + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_existing_cull +body: | + bb.0: + ; GCN-LABEL: name: hazard_existing_cull + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir index e3b96c08348fc..d49381c9b8aff 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -1,12 +1,11 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s # GCN-LABEL: name: hazard_vcmpx_permlane16 # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16 @@ -52,6 +51,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: V_NOP # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_v_nop @@ -129,7 +129,6 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_undef_src @@ -152,7 +151,6 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e64 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_e64_permlane16