diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 55ddb52da311e..8bf8241e343e7 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -87,8 +87,8 @@ __global__ void ffp2(double *p) { // UNSAFE-LABEL: @_Z4ffp2Pd // UNSAFE: global_atomic_add_f64 // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_max_f64 + // UNSAFE: global_atomic_min_f64 // UNSAFE: global_atomic_max_f64 // UNSAFE: global_atomic_min_f64 __atomic_fetch_add(p, 1.0, memory_order_relaxed); @@ -124,8 +124,8 @@ __global__ void ffp3(long double *p) { // SAFE: global_atomic_cmpswap_b64 // UNSAFE-LABEL: @_Z4ffp3Pe // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 - // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_max_f64 + // UNSAFE: global_atomic_min_f64 // UNSAFE: global_atomic_max_f64 // UNSAFE: global_atomic_min_f64 __atomic_fetch_add(p, 1.0L, memory_order_relaxed); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d5d7a2522a09b..e70a420ab566c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16118,6 +16118,34 @@ static bool isBFloat2(Type *Ty) { return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); } +/// \returns true if it's valid to emit a native instruction for \p RMW, based +/// on the properties of the target memory. +static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, + const AtomicRMWInst *RMW, + bool HasSystemScope) { + // The remote/fine-grained access logic is different from the integer + // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support, + // fine-grained access does not work, even for a device local allocation. + // + // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local + // allocations work. + if (HasSystemScope) { + if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && + RMW->hasMetadata("amdgpu.no.remote.memory")) + return true; + } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + return true; + + if (RMW->hasMetadata("amdgpu.no.fine.grained.memory")) + return true; + + // TODO: Auto-upgrade this attribute to the metadata in function body and stop + // checking it. + return RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsBool(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16268,37 +16296,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { Type *Ty = RMW->getType(); // LDS float and double fmin/fmax were always supported. - if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) - return AtomicExpansionKind::None; - - if (unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - - // Always expand system scope fp atomics. - if (HasSystemScope) - return AtomicExpansionKind::CmpXChg; + if (AS == AMDGPUAS::LOCAL_ADDRESS) { + return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; + } - // For flat and global cases: - // float, double in gfx7. Manual claims denormal support. - // Removed in gfx8. - // float, double restored in gfx10. - // double removed again in gfx11, so only f32 for gfx11/gfx12. - // - // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no - // f32. - // - // FIXME: Check scope and fine grained memory - if (AS == AMDGPUAS::FLAT_ADDRESS) { - if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || - AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { + // For flat and global cases: + // float, double in gfx7. Manual claims denormal support. + // Removed in gfx8. + // float, double restored in gfx10. + // double removed again in gfx11, so only f32 for gfx11/gfx12. + // + // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but + // no f32. + if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } } return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 1270e69998e6c..61d2c854dffa5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -318,29 +318,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -371,55 +353,21 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -500,27 +448,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -534,28 +465,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -585,53 +499,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -709,26 +591,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -770,27 +635,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -824,54 +672,19 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -929,34 +742,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -997,25 +790,10 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v11, v5 -; GFX7-NEXT: v_mov_b32_e32 v10, v4 -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v8 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v5, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmax v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmax v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v2, s[0:3], null offen -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v2, s[0:3], 0 offen -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, v1 -; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[8:9] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[10:11] +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s16 ; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3] -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[8:9] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v3, v8 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s16 ; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 -; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3] -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, v3 -; GFX7-NEXT: v_mov_b32_e32 v9, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-NEXT: v_mov_b32_e32 v7, v0 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index e36161a936e36..83be67a9138f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -318,29 +318,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -371,55 +353,21 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -500,27 +448,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -534,28 +465,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -585,53 +499,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -709,26 +591,9 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -770,27 +635,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -824,54 +672,19 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -929,34 +742,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -997,25 +790,10 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v11, v5 -; GFX7-NEXT: v_mov_b32_e32 v10, v4 -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v8 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v5, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v2, s[0:3], null offen -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v2, s[0:3], 0 offen -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result @@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, v1 -; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[8:9] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[10:11] +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s16 ; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3] -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[8:9] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s16 ; GFX10-NEXT: s_mov_b32 s7, s17 -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v3, v8 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s16 ; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 -; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3] -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX7-NEXT: s_mov_b32 s5, s7 ; GFX7-NEXT: s_mov_b32 s6, s16 ; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, v3 -; GFX7-NEXT: v_mov_b32_e32 v9, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-NEXT: v_mov_b32_e32 v7, v0 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index ec0408236975d..172ce4c065e13 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: s_mov_b32 s11, s17 ; GFX908-NEXT: s_mov_b32 s10, s16 ; GFX908-NEXT: s_mov_b32 s9, s7 ; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_mov_b32 s11, s17 ; GFX8-NEXT: s_mov_b32 s10, s16 ; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: s_mov_b32 s11, s17 ; GFX908-NEXT: s_mov_b32 s10, s16 ; GFX908-NEXT: s_mov_b32 s9, s7 ; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_mov_b32 s11, s17 ; GFX8-NEXT: s_mov_b32 s10, s16 ; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x800 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 @@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 @@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11 @@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 @@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 @@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 @@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 @@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 @@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +attributes #0 = { nounwind } +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index cd01cc7309fcd..61ee956747135 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: s_mov_b32 s11, s17 ; GFX908-NEXT: s_mov_b32 s10, s16 ; GFX908-NEXT: s_mov_b32 s9, s7 ; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_mov_b32 s11, s17 ; GFX8-NEXT: s_mov_b32 s10, s16 ; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: s_mov_b32 s11, s17 ; GFX908-NEXT: s_mov_b32 s10, s16 ; GFX908-NEXT: s_mov_b32 s9, s7 ; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_mov_b32 s11, s17 ; GFX8-NEXT: s_mov_b32 s10, s16 ; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x800 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 @@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 @@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s6, 0x200 @@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s6, 0x200 @@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s18, 0x200 @@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s18, 0x200 @@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s18, 0x200 @@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s18, 0x200 @@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s18, 0x200 @@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s18, 0x200 @@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 @@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 @@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 @@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 @@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s6 @@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 @@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s18 @@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s11, s17 @@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b32 s11, s17 @@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s11, s17 @@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s11, s17 @@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s17 @@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 @@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 @@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 @@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 ; GFX10-NEXT: s_mov_b32 s11, s17 ; GFX10-NEXT: s_mov_b32 s10, s16 ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: s_mov_b32 s11, s17 ; GFX7-NEXT: s_mov_b32 s10, s16 ; GFX7-NEXT: s_mov_b32 s9, s7 ; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s11, s17 ; GFX6-NEXT: s_mov_b32 s10, s16 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +attributes #0 = { nounwind } +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index a3424793fdc4d..ea2427a3c420f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -929,29 +929,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -982,56 +964,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[3:4] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX7-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmax v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmax v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result @@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX10-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[3:4] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX7-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmax v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmax v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ret void } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 0d954e277cdd5..2767b66e44703 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -929,29 +929,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -982,56 +964,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX10-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[3:4] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX7-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result @@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX10-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[3:4] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX7-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ret void } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 4f7b6164936f8..06d971febd038 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -28,7 +28,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -53,7 +53,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -63,7 +63,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -73,7 +73,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -97,7 +97,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -121,7 +121,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -145,7 +145,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -158,7 +158,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -171,12 +171,12 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -190,7 +190,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -215,7 +215,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -225,7 +225,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -235,7 +235,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -259,7 +259,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -283,7 +283,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -308,7 +308,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -335,12 +335,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -354,7 +354,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -379,7 +379,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -389,7 +389,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -399,7 +399,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -423,7 +423,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -447,7 +447,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -472,7 +472,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -485,7 +485,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -499,12 +499,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32: +define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -518,7 +518,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -542,7 +542,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -552,7 +552,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -562,7 +562,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -585,7 +585,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -608,7 +608,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -631,7 +631,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -643,7 +643,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -655,12 +655,12 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -674,7 +674,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -698,7 +698,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -708,7 +708,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -718,7 +718,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -741,7 +741,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -764,7 +764,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -789,7 +789,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -801,7 +801,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -814,12 +814,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -833,7 +833,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -857,7 +857,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -923,7 +923,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -948,7 +948,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -960,7 +960,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -973,44 +973,26 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64: +define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64: +define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16: +define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16: +define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16: +define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16: +define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16: +define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16: +define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 591e01b11bd24..65df8f07fb8b3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -28,7 +28,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -53,7 +53,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -63,7 +63,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -73,7 +73,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -97,7 +97,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -121,7 +121,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -145,7 +145,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -158,7 +158,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -171,12 +171,12 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -190,7 +190,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -215,7 +215,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -225,7 +225,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -235,7 +235,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -259,7 +259,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -283,7 +283,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -308,7 +308,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -335,12 +335,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -354,7 +354,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -379,7 +379,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -389,7 +389,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -399,7 +399,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -423,7 +423,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -447,7 +447,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -472,7 +472,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -485,7 +485,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -499,12 +499,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32: +define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -518,7 +518,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -542,7 +542,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -552,7 +552,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -562,7 +562,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -585,7 +585,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -608,7 +608,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -631,7 +631,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -643,7 +643,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -655,12 +655,12 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -674,7 +674,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -698,7 +698,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -708,7 +708,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -718,7 +718,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -741,7 +741,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -764,7 +764,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -789,7 +789,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -801,7 +801,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -814,12 +814,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -833,7 +833,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -857,7 +857,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -923,7 +923,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -948,7 +948,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -960,7 +960,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -973,44 +973,26 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64: +define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64: +define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16: +define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16: +define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16: +define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16: +define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16: +define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16: +define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 954d810d7aa4e..31717bf1c8253 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -1148,55 +1148,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB2_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB2_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1206,60 +1180,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB2_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1330,55 +1277,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB2_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB2_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1388,60 +1309,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB2_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB2_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic ret void @@ -1623,18 +1517,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1642,27 +1536,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_cbranch_execz .LBB3_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB3_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -1693,45 +1573,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_cbranch_execz .LBB3_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB3_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -1752,13 +1618,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -1766,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1775,29 +1641,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: s_cbranch_execz .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB3_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -1818,13 +1668,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1832,36 +1682,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: s_cbranch_execz .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB3_4: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -2080,27 +1915,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1] +; GFX1064-DPP-NEXT: .LBB3_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -2162,29 +1983,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1] +; GFX1032-DPP-NEXT: .LBB3_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -2252,34 +2059,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB3_2: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -2335,34 +2126,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB3_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic @@ -2439,55 +2214,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2497,60 +2246,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB4_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB4_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2621,55 +2343,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB4_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB4_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2679,60 +2375,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB4_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB4_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void @@ -2913,18 +2582,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2932,27 +2601,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_cbranch_execz .LBB5_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB5_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -2983,45 +2638,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_cbranch_execz .LBB5_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB5_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3042,13 +2683,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -3056,7 +2697,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3065,29 +2706,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: s_cbranch_execz .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB5_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3108,13 +2733,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -3122,36 +2747,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: s_cbranch_execz .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB5_4: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3370,27 +2980,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1] +; GFX1064-DPP-NEXT: .LBB5_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3452,29 +3048,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1] +; GFX1032-DPP-NEXT: .LBB5_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3542,34 +3124,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -3625,34 +3191,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4 @@ -6003,59 +5553,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 -; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: @@ -6197,59 +5719,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB8_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB8_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: @@ -6503,20 +5997,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6524,28 +6018,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: s_cbranch_execz .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 -; GFX1064-NEXT: .LBB9_5: +; GFX1064-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1] +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -6576,48 +6055,33 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: s_cbranch_execz .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 -; GFX1032-NEXT: .LBB9_5: +; GFX1032-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1] +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -7008,28 +6472,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] -; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -7104,30 +6553,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -9680,55 +9114,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9738,60 +9146,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB12_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB12_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9862,55 +9243,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9920,60 +9275,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -10048,55 +9376,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10106,60 +9408,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB13_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB13_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10230,55 +9505,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB13_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB13_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10288,60 +9537,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB13_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB13_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 4ab86835f90d1..6f2702eae9302 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -1148,55 +1148,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB2_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB2_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1206,60 +1180,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB2_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1330,55 +1277,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB2_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB2_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1388,60 +1309,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB2_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB2_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic ret void @@ -1623,18 +1517,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1642,27 +1536,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_cbranch_execz .LBB3_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB3_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -1693,45 +1573,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_cbranch_execz .LBB3_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB3_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -1752,13 +1618,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -1766,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1775,29 +1641,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: s_cbranch_execz .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB3_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -1818,13 +1668,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1832,36 +1682,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: s_cbranch_execz .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB3_4: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -2080,27 +1915,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1] +; GFX1064-DPP-NEXT: .LBB3_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -2162,29 +1983,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1] +; GFX1032-DPP-NEXT: .LBB3_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -2252,34 +2059,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB3_2: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -2335,34 +2126,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB3_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic @@ -2439,55 +2214,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2497,60 +2246,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB4_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB4_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2621,55 +2343,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB4_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB4_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2679,60 +2375,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB4_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB4_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void @@ -2913,18 +2582,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2932,27 +2601,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_cbranch_execz .LBB5_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB5_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -2983,45 +2638,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_cbranch_execz .LBB5_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB5_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3042,13 +2683,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -3056,7 +2697,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3065,29 +2706,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: s_cbranch_execz .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB5_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3108,13 +2733,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -3122,36 +2747,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: s_cbranch_execz .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB5_4: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3370,27 +2980,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1] +; GFX1064-DPP-NEXT: .LBB5_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3452,29 +3048,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1] +; GFX1032-DPP-NEXT: .LBB5_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3542,34 +3124,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -3625,34 +3191,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1] -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4 @@ -6003,59 +5553,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 -; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: @@ -6197,59 +5719,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB8_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 -; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB8_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: @@ -6503,20 +5997,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6524,28 +6018,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: s_cbranch_execz .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 -; GFX1064-NEXT: .LBB9_5: +; GFX1064-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1] +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -6576,48 +6055,33 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: s_cbranch_execz .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 -; GFX1032-NEXT: .LBB9_5: +; GFX1032-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1] +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -7008,28 +6472,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] -; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -7104,30 +6553,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -9680,55 +9114,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9738,60 +9146,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB12_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB12_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9862,55 +9243,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9920,60 +9275,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 -; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -10048,55 +9376,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10106,60 +9408,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB13_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB13_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10230,55 +9505,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1064-DPP-NEXT: .LBB13_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1] +; GFX1032-DPP-NEXT: .LBB13_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -10288,60 +9537,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: .LBB13_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: .LBB13_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll index 7290a91c9ccd2..b0d2824a64ee3 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll @@ -3345,176 +3345,936 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_ ;--------------------------------------------------------------------- define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res @@ -3525,176 +4285,936 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ;--------------------------------------------------------------------- define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] ; - %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst - ret float %res -} - -define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] ; - %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %res -} - -define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP6]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP6]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll index 05fb224e6f145..c5bf26dc4c0e1 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll @@ -3367,66 +3367,339 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %res @@ -3455,66 +3728,339 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res @@ -3542,71 +4088,344 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float ; COMMON: atomicrmw.end: ; COMMON-NEXT: ret float [[TMP6]] ; - %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst - ret float %res -} - -define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] ; - %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %res -} - -define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %res @@ -3635,66 +4454,339 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode } define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res } define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) { -; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret float [[TMP6]] +; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP6]] +; +; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP6]] +; +; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP6]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP6]] +; +; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] +; +; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX10-NEXT: ret float [[RES]] +; +; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll index af6b7e0addfb1..ee360dee79425 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll @@ -1241,88 +1241,468 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode ;--------------------------------------------------------------------- define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP6]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP6]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst ret double %res } define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res } define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %res } define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %res @@ -1457,88 +1837,468 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode ;--------------------------------------------------------------------- define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP6]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[TMP6]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst ret double %res } define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res } define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %res } define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll index 69d65e6f1f379..ac5dd55002f3f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll @@ -1263,66 +1263,339 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub } define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res } define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0 ret double %res } define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %res @@ -1479,66 +1752,339 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub } define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res } define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0 ret double %res } define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) { -; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret double [[TMP6]] +; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP6]] +; +; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP6]] +; +; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP6]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX10-NEXT: ret double [[RES]] +; +; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP6]] +; +; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret double [[TMP6]] ; %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %res