From 20cb609e803bf02dabd7d2db600bf8405935be06 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 6 May 2025 11:51:47 +0900 Subject: [PATCH 1/3] [AMDGPU] Rework GFX11 VALU Mask Write Hazard Apply additional counter waits to address VALU writes to SGPRs. Rework expiry detection and apply wait coalescing to mitigate some of the additional waits. --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 233 ++++++++--- .../atomic_optimizations_global_pointer.ll | 41 +- .../atomic_optimizations_local_pointer.ll | 118 ++++-- .../atomic_optimizations_pixelshader.ll | 1 + llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 2 +- .../global-saddr-atomics-min-max-system.ll | 16 + .../AMDGPU/global_atomics_scan_fmax.ll | 27 +- .../AMDGPU/global_atomics_scan_fmin.ll | 27 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 2 + .../CodeGen/AMDGPU/nor-divergent-lanemask.ll | 3 +- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 + .../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 388 +++++++++++++++++- 12 files changed, 736 insertions(+), 124 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a911e7e6c01a4..2595fb33eb432 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3267,29 +3267,106 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return false; assert(!ST.hasExtendedWaitCounts()); - if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) + if (!ST.isWave64()) + return false; + + const bool IsSALU = SIInstrInfo::isSALU(*MI); + const bool IsVALU = SIInstrInfo::isVALU(*MI); + if (!IsSALU && !IsVALU) return false; // The hazard sequence is three instructions: // 1. VALU reads SGPR as mask - // 2. SALU writes SGPR - // 3. SALU reads SGPR - // The hazard can expire if the distance between 2 and 3 is sufficient. 
- // In practice this happens <10% of the time, hence this always assumes - // the hazard exists if 1 and 2 are present to avoid searching. + // 2. VALU/SALU writes SGPR + // 3. VALU/SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient, + // or (2) is VALU and (3) is SALU. + // In practice this happens <10% of the time, hence always assume the hazard + // exists if (1) and (2) are present to avoid searching all SGPR reads. - const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IgnoreableSGPR = [](const Register Reg) { + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::M0: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + case AMDGPU::SCC: + return true; + default: + return false; + } + }; + auto IsVCC = [](const Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + }; + + struct StateType { + SmallSet HazardSGPRs; + + static unsigned getHashValue(const StateType &State) { + return hash_combine_range(State.HazardSGPRs); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.HazardSGPRs == RHS.HazardSGPRs; + } + }; + + SmallVector WaitInstrs; + bool HasSGPRRead = false; + StateType InitialState; + + // Look for SGPR write. + MachineOperand *HazardDef = nullptr; + for (MachineOperand &Op : MI->operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef() && HazardDef) + continue; + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) + continue; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + // Also check for SGPR reads. 
+ if (Op.isUse()) { + HasSGPRRead = true; + continue; + } - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || - HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || - HazardReg == AMDGPU::M0) + assert(!HazardDef); + HazardDef = &Op; + } + + if (!HazardDef) return false; - auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + // Setup to track writes to individual SGPRs + const Register HazardReg = HazardDef->getReg(); + if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { + InitialState.HazardSGPRs.insert(HazardReg); + } else if (IsVCC(HazardReg)) { + InitialState.HazardSGPRs.insert(AMDGPU::VCC_LO); + InitialState.HazardSGPRs.insert(AMDGPU::VCC_HI); + } else { + assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1)); + } + + auto IsHazardFn = [&](StateType &State, const MachineInstr &I) { + if (State.HazardSGPRs.empty()) + return HazardExpired; + switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: @@ -3304,11 +3381,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { case AMDGPU::V_SUBB_U32_e32: case AMDGPU::V_SUBB_U32_dpp: case AMDGPU::V_SUBBREV_U32_e32: - case AMDGPU::V_SUBBREV_U32_dpp: + case AMDGPU::V_SUBBREV_U32_dpp: { // These implicitly read VCC as mask source. - return HazardReg == AMDGPU::VCC || - HazardReg == AMDGPU::VCC_LO || - HazardReg == AMDGPU::VCC_HI; + return IsVCC(HazardReg) ? HazardFound : NoHazardFound; + } case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: case AMDGPU::V_CNDMASK_B16_t16_e64: @@ -3324,68 +3400,101 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Only check mask register overlaps. 
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); assert(SSRCOp); - return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg); + return Result ? HazardFound : NoHazardFound; } default: - return false; + return NoHazardFound; } }; - const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { - // s_waitcnt_depctr sa_sdst(0) mitigates hazard. - if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - - // VALU access to any SGPR or literal constant other than HazardReg - // mitigates hazard. No need to check HazardReg here as this will - // only be called when !IsHazardFn. - if (!SIInstrInfo::isVALU(I)) - return false; - for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { - const MachineOperand &Op = I.getOperand(OpNo); - if (Op.isReg()) { - Register OpReg = Op.getReg(); - // Only consider uses - if (!Op.isUse()) + auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + // Record waits within region of instructions free of SGPR reads. + if (!HasSGPRRead && I.getParent() == MI->getParent()) + WaitInstrs.push_back(&I); + break; + default: + // Update tracking of SGPR reads and writes. 
+ for (auto &Op : I.operands()) { + if (!Op.isReg()) continue; - // Ignore EXEC - if (OpReg == AMDGPU::EXEC || - OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::EXEC_HI) + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) continue; - // Ignore all implicit uses except VCC - if (Op.isImplicit()) { - if (OpReg == AMDGPU::VCC || - OpReg == AMDGPU::VCC_LO || - OpReg == AMDGPU::VCC_HI) - return true; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + if (Op.isUse()) { + HasSGPRRead = true; continue; } - if (TRI.isSGPRReg(MRI, OpReg)) - return true; - } else { - const MCInstrDesc &InstDesc = I.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - if (!TII.isInlineConstant(Op, OpInfo)) - return true; + + // Stop tracking any SGPRs with writes on the basis that they will + // already have an appropriate wait inserted afterwards. + SmallVector Found; + for (Register SGPR : State.HazardSGPRs) { + if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) + Found.push_back(SGPR); + } + for (Register SGPR : Found) + State.HazardSGPRs.erase(SGPR); } + break; } - return false; }; // Check for hazard - if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == - std::numeric_limits::max()) + if (!hasHazard(InitialState, IsHazardFn, UpdateStateFn, + MI->getParent(), + std::next(MI->getReverseIterator()))) return false; - auto NextMI = std::next(MI->getIterator()); + // Compute counter mask + unsigned DepCtr = + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0) + : AMDGPU::DepCtr::encodeFieldVaSdst(0)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0); + + // Try to merge previous waits into this one for regions with no SGPR reads. + if (WaitInstrs.size()) { + const unsigned ConstantBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), + 0), + 0); + + for (const MachineInstr *Instr : WaitInstrs) { + // Don't touch bundled waits. 
+ if (Instr->isBundled()) + continue; + MachineInstr *WaitMI = const_cast(Instr); + unsigned WaitMask = WaitMI->getOperand(0).getImm(); + // Only work with counters related to this hazard. + if ((WaitMask & ConstantBits) != ConstantBits) + continue; + DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), + AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); + WaitMI->eraseFromParent(); + } + } - // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + // Add s_waitcnt_depctr after SGPR write. + auto NextMI = std::next(MI->getIterator()); auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(DepCtr); // SALU write may be s_getpc in a bundle. 
updateGetPCBundle(NewMI); diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 258bc2959f391..0216bca5f6cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -3165,17 +3165,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf @@ -3183,6 +3185,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -3195,8 +3198,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -3248,8 +3252,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -5872,6 +5877,7 
@@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[10:11] @@ -6400,6 +6406,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -7004,6 +7011,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX1164_ITERATIVE-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -7691,17 +7699,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7709,6 +7719,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -7721,8 +7732,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; 
GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -7770,8 +7782,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, v10, s8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7783,6 +7796,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: s_and_not1_b64 exec, exec, s[12:13] @@ -7797,8 +7811,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -12792,6 +12807,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12848,6 +12864,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13826,9 +13843,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h @@ -13837,6 +13855,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; 
GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13885,6 +13904,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xf1ff ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] @@ -13895,6 +13915,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f5506f3..55140a4b0c2f1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -963,6 +963,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: 
v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -2660,17 +2661,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -2678,6 +2681,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, 
vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -2690,13 +2694,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2721,6 +2727,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -3315,23 +3322,26 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 @@ -3339,10 +3349,11 @@ define amdgpu_kernel void 
@add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffc ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -4370,6 +4381,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -6090,17 +6102,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 
0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6108,6 +6122,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 @@ -6120,13 +6135,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc ; GFX1164_DPP-NEXT: 
s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6151,6 +6168,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -6777,6 +6795,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -7500,6 +7519,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -8136,6 +8156,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; 
GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -8858,6 +8879,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -9494,6 +9516,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -10216,6 +10239,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 @@ -10852,6 +10876,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -11130,6 +11155,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: 
v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -11965,16 +11991,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 @@ -11982,8 +12011,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -11993,6 +12023,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 @@ -12007,14 +12038,17 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: 
s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12039,6 +12073,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -12678,6 +12713,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -12956,6 +12992,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -13791,16 +13828,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 @@ -13808,8 +13848,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -13819,6 +13860,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; 
GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 @@ -13833,14 +13875,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -13865,6 +13910,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -14504,6 +14550,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -14778,6 +14825,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -15610,16 +15658,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; 
GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -15627,8 +15678,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -15638,6 +15690,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -15652,14 +15705,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: 
v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15684,6 +15740,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -16317,6 +16374,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 @@ -16592,6 +16650,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -17422,16 +17481,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 @@ -17439,8 +17501,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 
v1, v5, v1, vcc ; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 @@ -17450,6 +17513,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 @@ -17464,14 +17528,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17496,6 +17563,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; 
GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 0f593045007fa..62f09deeb9b08 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -515,6 +515,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s14, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s13, 32 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 5959f76492f3c..f6149cf9caf7c 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s0, s2, s0 ; GFX11-NEXT: s_addc_u32 s1, s3, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index b7ee9f70f6014..65832f8db2b00 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -519,6 +519,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr 
addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -649,6 +650,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -771,6 +773,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -885,6 +888,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -1443,6 +1447,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -1573,6 +1578,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -1695,6 +1701,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -1809,6 +1816,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -2367,6 +2375,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -2497,6 +2506,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -2619,6 +2629,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: 
v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -2733,6 +2744,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -3291,6 +3303,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -3421,6 +3434,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] @@ -3543,6 +3557,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] @@ -3657,6 +3672,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb39e97f5..f6619d5954a86 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -969,14 +969,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2008,14 +2008,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3047,14 +3047,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4271,10 +4271,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -5539,10 +5540,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6807,10 +6809,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..2fcb9a1626fed 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -969,14 +969,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2008,14 +2008,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, 
v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3047,14 +3047,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4271,10 +4271,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -5539,10 +5540,11 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6807,10 +6809,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index 0a2e7afa3d417..8103140f2a587 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -1185,6 +1185,7 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 { ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, 
vcc ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB21_1 ; GFX11-NEXT: s_endpgm @@ -1595,6 +1596,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xf1ff ; GFX11-NEXT: .LBB26_2: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll index 420539346b400..97c2bdceb31a8 100644 --- a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll +++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll @@ -13,6 +13,7 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) { ; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff ; SDAG-W64-NEXT: ; return to shader part epilog ; ; GISEL-W64-LABEL: test_nor: @@ -57,8 +58,8 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) { ; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff ; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe ; SDAG-W64-NEXT: ; return to shader part epilog ; ; GISEL-W64-LABEL: test_or_two_uses: diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index b21c781f6223a..5461532184fc5 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1239,6 +1239,7 @@ define amdgpu_ps void 
@phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0 ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX11-NEXT: ; %bb.1: ; %bb @@ -2066,6 +2067,7 @@ define amdgpu_ps void @scc_use_after_kill_inst(float inreg %x, i32 inreg %y) #0 ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfffd ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB17_6 ; GFX11-NEXT: ; %bb.1: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index 1eabe62e7710e..e1d3ebc2d35d1 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -38,6 +38,22 @@ define amdgpu_gs void @mask_hazard_no_hazard1() { ret void } define amdgpu_gs void @mask_hazard_no_hazard2() { ret void } define amdgpu_gs void @mask_hazard_no_hazard3() { ret void } + define amdgpu_gs void @mask_hazard_cancel_hazard1() { ret void } + define amdgpu_gs void @mask_hazard_cancel_hazard2() { ret void } + define amdgpu_gs void @mask_hazard_cancel_hazard3() { ret void } + define amdgpu_gs void @mask_hazard_cancel_hazard4() { ret void } + define amdgpu_gs void @mask_hazard_partial_cancel1() { ret void } + define amdgpu_gs void @mask_hazard_partial_cancel2() { ret void } + define amdgpu_gs void @mask_hazard_partial_cancel3() { ret void } + define amdgpu_gs void @mask_hazard_partial_cancel4() { ret void } + define amdgpu_gs void @mask_hazard_valu_readlane1() { ret void } + define amdgpu_gs void @mask_hazard_valu_readlane2() { ret void } + define amdgpu_gs void @mask_hazard_valu_readlane3() { ret void } + define amdgpu_gs void 
@mask_hazard_valu_readfirstlane() { ret void } + define amdgpu_gs void @mask_hazard_valu_vcmp_vcc() { ret void } + define amdgpu_gs void @mask_hazard_valu_vcmp_sgpr() { ret void } + define amdgpu_gs void @mask_hazard_combine1() { ret void } + define amdgpu_gs void @mask_hazard_combine2() { ret void } ... --- @@ -487,8 +503,8 @@ body: | ; GFX11-LABEL: name: mask_hazard_subreg3 ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec ; GFX11-NEXT: $sgpr2 = S_MOV_B32 0 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr3 = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: mask_hazard_subreg3 @@ -655,3 +671,373 @@ body: | $vgpr2 = V_MOV_B32_e32 0, implicit $exec S_ENDPGM 0 ... + +--- +name: mask_hazard_cancel_hazard1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cancel_hazard1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: $vcc_hi = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: $vcc = S_MOV_B64 1 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_MOV_B32 0 + $vcc_hi = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 $vcc_lo + $vcc = S_MOV_B64 1 + S_ENDPGM 0 +... + +--- +name: mask_hazard_cancel_hazard2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cancel_hazard2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_MOV_B64 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: $vcc = S_MOV_B64 1 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc = S_MOV_B64 0 + $sgpr0 = S_MOV_B32 $vcc_lo + $vcc = S_MOV_B64 1 + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_cancel_hazard3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cancel_hazard3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $sgpr0_sgpr1 = S_MOV_B64 0 + $sgpr4 = S_MOV_B32 $sgpr0 + $sgpr0_sgpr1 = S_MOV_B64 1 + S_ENDPGM 0 +... + +--- +name: mask_hazard_cancel_hazard4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cancel_hazard4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr4 = S_MOV_B32 $sgpr0 + $sgpr0_sgpr1 = S_MOV_B64 1 + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_partial_cancel1 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_partial_cancel1 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GFX11-NEXT: $vcc = S_MOV_B64 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_partial_cancel1 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0 + ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GFX12-NEXT: $vcc = S_MOV_B64 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 $vcc_lo + $vcc = S_MOV_B64 1 + S_ENDPGM 0 +... + +--- +name: mask_hazard_partial_cancel2 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_partial_cancel2 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GFX11-NEXT: $vcc = S_MOV_B64 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_partial_cancel2 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0 + ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo + ; GFX12-NEXT: $vcc = S_MOV_B64 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_hi = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 $vcc_lo + $vcc = S_MOV_B64 1 + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_partial_cancel3 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_partial_cancel3 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr0 + ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_partial_cancel3 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr0 + ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 $sgpr0 + $sgpr0_sgpr1 = S_MOV_B64 1 + S_ENDPGM 0 +... + +--- +name: mask_hazard_partial_cancel4 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_partial_cancel4 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr1 + ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_partial_cancel4 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0 + ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534 + ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr1 + ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $sgpr1 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 $sgpr1 + $sgpr0_sgpr1 = S_MOV_B64 1 + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_valu_readlane1 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_readlane1 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_readlane1 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2 = V_READLANE_B32 $vgpr3, 0 + S_ENDPGM 0 +... + +--- +name: mask_hazard_valu_readlane2 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_readlane2 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_readlane2 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr3 = V_READLANE_B32 $vgpr3, 1 + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_valu_readlane3 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_readlane3 + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0 + ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_readlane3 + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0 + ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2 = V_READLANE_B32 $vgpr3, 0 + $sgpr3 = V_READLANE_B32 $vgpr3, 1 + S_ENDPGM 0 +... + +--- +name: mask_hazard_valu_readfirstlane +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_readfirstlane + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_readfirstlane + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_valu_vcmp_vcc +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_vcmp_vcc + ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65533 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_vcmp_vcc + ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: mask_hazard_valu_vcmp_sgpr +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_valu_vcmp_sgpr + ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_valu_vcmp_sgpr + ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_combine1 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_combine1 + ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0 + ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61948 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_combine1 + ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0 + ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_combine2 +body: | + bb.0: + ; GFX11-LABEL: name: mask_hazard_combine2 + ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX11-NEXT: S_WAITCNT_DEPCTR 65532 + ; GFX11-NEXT: $sgpr1 = S_MOV_B32 $sgpr4 + ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX11-NEXT: S_WAITCNT_DEPCTR 61950 + ; GFX11-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: mask_hazard_combine2 + ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX12-NEXT: $sgpr1 = S_MOV_B32 $sgpr4 + ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 $sgpr4 + $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec + S_ENDPGM 0 +... 
From e0a2ea92784408f3f7fcdbbb7f6d773959b91e6d Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Sun, 28 Sep 2025 12:01:43 +0900 Subject: [PATCH 2/3] - Address reviewer feedback --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 2595fb33eb432..d1459baefe933 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3354,9 +3354,6 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { const Register HazardReg = HazardDef->getReg(); if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { InitialState.HazardSGPRs.insert(HazardReg); - } else if (IsVCC(HazardReg)) { - InitialState.HazardSGPRs.insert(AMDGPU::VCC_LO); - InitialState.HazardSGPRs.insert(AMDGPU::VCC_HI); } else { assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); From e20e4fda0b04f7a32a72873ccd9cc2e63de894d7 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Mon, 13 Oct 2025 14:14:40 +0900 Subject: [PATCH 3/3] - Avoid const_cast --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index d1459baefe933..52cc4ca5a955c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3405,11 +3405,15 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { } }; + const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0), + 0); auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { switch (I.getOpcode()) { case AMDGPU::S_WAITCNT_DEPCTR: - // Record waits within region of 
instructions free of SGPR reads.
-      if (!HasSGPRRead && I.getParent() == MI->getParent())
+      // Record mergeable waits within region of instructions free of SGPR reads.
+      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
         WaitInstrs.push_back(&I);
       break;
     default:
@@ -3459,21 +3463,22 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
           : AMDGPU::DepCtr::encodeFieldSaSdst(0);
 
   // Try to merge previous waits into this one for regions with no SGPR reads.
-  if (WaitInstrs.size()) {
-    const unsigned ConstantBits = AMDGPU::DepCtr::encodeFieldSaSdst(
-        AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0),
-                                          0),
-        0);
-
-    for (const MachineInstr *Instr : WaitInstrs) {
-      // Don't touch bundled waits.
-      if (Instr->isBundled())
+  if (!WaitInstrs.empty()) {
+    // Note: WaitInstrs contains const pointers, so walk backward from MI to
+    // obtain a mutable pointer to each instruction to be merged.
+    // This is expected to be a very short walk within the same block.
+    SmallVector<MachineInstr *> ToErase;
+    unsigned Found = 0;
+    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+                                             End = MI->getParent()->rend();
+         Found < WaitInstrs.size() && It != End; ++It) {
+      MachineInstr *WaitMI = &*It;
+      // Find next wait instruction.
+      if (std::as_const(WaitMI) != WaitInstrs[Found])
         continue;
-      MachineInstr *WaitMI = const_cast<MachineInstr *>(Instr);
+      Found++;
       unsigned WaitMask = WaitMI->getOperand(0).getImm();
-      // Only work with counters related to this hazard.
- if ((WaitMask & ConstantBits) != ConstantBits) - continue; + assert((WaitMask & ConstantMaskBits) == ConstantMaskBits); DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); @@ -3483,8 +3488,11 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); - WaitMI->eraseFromParent(); + ToErase.push_back(WaitMI); } + assert(Found == WaitInstrs.size()); + for (MachineInstr *WaitMI : ToErase) + WaitMI->eraseFromParent(); } // Add s_waitcnt_depctr after SGPR write.