diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1957e442dbabb..7575510dd7f98 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17541,9 +17541,11 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( // where we only insert a check for private and still use the flat instruction // for global and shared. - bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd && - Subtarget->hasAtomicFaddInsts() && - RMW->getType()->isFloatTy(); + bool FullFlatEmulation = + RMW && RMW->getOperation() == AtomicRMWInst::FAdd && + ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) || + (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && + RMW->getType()->isDoubleTy())); // If the return value isn't used, do not introduce a false use in the phi. bool ReturnValueIsUsed = !AI->use_empty(); diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 1669909e96eb1..231f53d7f3710 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -799,6 +799,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: s_mov_b32 s6, 32 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], -1 +; GFX90A-NEXT: s_mov_b32 s6, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX90A-NEXT: .LBB5_1: ; %Flow4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b32 s4, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared +; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1] +; GFX90A-NEXT: s_branch .LBB5_10 +; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: s_mov_b32 s6, 32 ; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 @@ -813,50 +838,54 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2 -; GFX90A-NEXT: s_branch .LBB5_3 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5 +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.private ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen -; GFX90A-NEXT: s_branch .LBB5_6 -; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_branch .LBB5_9 +; GFX90A-NEXT: .LBB5_5: ; %atomicrmw.global +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_getpc_b64 s[4:5] ; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_branch .LBB5_4 -; GFX90A-NEXT: .LBB5_3: ; %Flow +; GFX90A-NEXT: s_branch .LBB5_7 +; GFX90A-NEXT: .LBB5_6: ; %Flow ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1 -; GFX90A-NEXT: s_branch .LBB5_6 -; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX90A-NEXT: s_branch .LBB5_9 +; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[6:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_4 -; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX90A-NEXT: s_cbranch_execnz .LBB5_7 +; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_branch .LBB5_3 -; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi -; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_9: ; %Flow3 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB5_1 +; GFX90A-NEXT: .LBB5_10: ; %atomicrmw.phi +; GFX90A-NEXT: ; %bb.11: ; %atomicrmw.end ; GFX90A-NEXT: s_mov_b32 s4, 32 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 @@ -866,6 +895,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: s_mov_b32 s2, 32 +; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], -1 +; GFX942-NEXT: s_mov_b32 s2, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX942-NEXT: .LBB5_1: ; %Flow4 +; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccnz .LBB5_10 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared +; GFX942-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1] +; GFX942-NEXT: s_branch .LBB5_10 +; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX942-NEXT: s_mov_b32 s2, 32 ; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 @@ -880,48 +934,52 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_cbranch_vccnz .LBB5_2 -; GFX942-NEXT: s_branch .LBB5_3 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX942-NEXT: s_cbranch_vccnz .LBB5_5 +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_4: ; %atomicrmw.private ; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] -; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0 -; GFX942-NEXT: s_branch .LBB5_6 -; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX942-NEXT: scratch_store_dwordx2 off, v[4:5], s0 +; GFX942-NEXT: s_branch .LBB5_9 +; GFX942-NEXT: .LBB5_5: ; %atomicrmw.global +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: s_branch .LBB5_4 -; GFX942-NEXT: .LBB5_3: ; %Flow +; GFX942-NEXT: s_branch .LBB5_7 +; GFX942-NEXT: .LBB5_6: ; %Flow ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX942-NEXT: s_cbranch_vccnz .LBB5_1 -; GFX942-NEXT: s_branch .LBB5_6 -; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX942-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX942-NEXT: s_branch .LBB5_9 +; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB5_4 -; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX942-NEXT: s_cbranch_execnz .LBB5_7 +; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: s_branch .LBB5_3 -; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi -; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_9: ; %Flow3 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB5_1 +; GFX942-NEXT: .LBB5_10: ; %atomicrmw.phi +; GFX942-NEXT: ; %bb.11: ; %atomicrmw.end ; GFX942-NEXT: s_mov_b32 s0, 32 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3] ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index e13c895a1cc85..cfe4d24d427e7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5758,29 +5758,38 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB30_4 +; GFX942-NEXT: s_cbranch_execnz .LBB30_8 ; GFX942-NEXT: .LBB30_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB30_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB30_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB30_2 -; GFX942-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB30_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB30_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc @@ -5788,6 +5797,18 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX942-NEXT: .LBB30_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB30_2 +; GFX942-NEXT: .LBB30_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5894,28 +5915,37 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_8 ; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB30_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB30_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB30_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB30_2 -; GFX90A-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB30_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB30_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -5924,6 +5954,17 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB30_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB30_2 +; GFX90A-NEXT: .LBB30_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6160,28 +6201,37 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB31_4 +; GFX942-NEXT: s_cbranch_execnz .LBB31_8 ; GFX942-NEXT: .LBB31_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB31_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB31_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB31_2 -; GFX942-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB31_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB31_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc @@ -6189,6 +6239,18 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX942-NEXT: .LBB31_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB31_2 +; GFX942-NEXT: .LBB31_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6306,27 +6368,36 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_8 ; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB31_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB31_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB31_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB31_2 -; GFX90A-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB31_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB31_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -6335,6 +6406,17 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB31_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB31_2 +; GFX90A-NEXT: .LBB31_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6581,28 +6663,37 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB32_4 +; GFX942-NEXT: s_cbranch_execnz .LBB32_8 ; GFX942-NEXT: .LBB32_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB32_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB32_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB32_2 -; GFX942-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB32_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB32_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc @@ -6610,6 +6701,18 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX942-NEXT: .LBB32_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB32_2 +; GFX942-NEXT: .LBB32_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6727,27 +6830,36 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_8 ; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB32_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB32_2 -; GFX90A-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB32_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB32_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -6756,6 +6868,17 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB32_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB32_2 +; GFX90A-NEXT: .LBB32_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6994,27 +7117,35 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB33_4 +; GFX942-NEXT: s_cbranch_execnz .LBB33_8 ; GFX942-NEXT: .LBB33_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB33_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB33_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB33_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB33_2 -; GFX942-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB33_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB33_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc @@ -7022,6 +7153,18 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX942-NEXT: .LBB33_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB33_2 +; GFX942-NEXT: .LBB33_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX942-NEXT: ds_add_f64 v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7129,26 +7272,34 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_8 ; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB33_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB33_2 -; GFX90A-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB33_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB33_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -7157,6 +7308,17 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB33_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB33_2 +; GFX90A-NEXT: .LBB33_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_f64 v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7390,27 +7552,35 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB34_4 +; GFX942-NEXT: s_cbranch_execnz .LBB34_8 ; GFX942-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB34_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB34_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB34_2 -; GFX942-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB34_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB34_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc @@ -7418,6 +7588,18 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX942-NEXT: .LBB34_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB34_2 +; GFX942-NEXT: .LBB34_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX942-NEXT: ds_add_f64 v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7532,26 +7714,34 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_8 ; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB34_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB34_2 -; GFX90A-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB34_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB34_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -7560,6 +7750,17 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB34_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB34_2 +; GFX90A-NEXT: .LBB34_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_f64 v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7801,27 +8002,35 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_3 -; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: ; %bb.1: ; %Flow2 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB35_4 +; GFX942-NEXT: s_cbranch_execnz .LBB35_8 ; GFX942-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] -; GFX942-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX942-NEXT: .LBB35_3: ; %atomicrmw.check.private +; GFX942-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB35_5 +; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB35_2 -; GFX942-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX942-NEXT: .LBB35_5: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB35_7 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc @@ -7829,6 +8038,18 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX942-NEXT: .LBB35_7: ; %Flow1 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB35_2 +; GFX942-NEXT: .LBB35_8: ; %atomicrmw.shared +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX942-NEXT: ds_add_f64 v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7943,26 +8164,34 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: ; %bb.1: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_4 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_8 ; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB35_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB35_2 -; GFX90A-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX90A-NEXT: .LBB35_5: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB35_7 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -7971,6 +8200,17 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB35_7: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB35_2 +; GFX90A-NEXT: .LBB35_8: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_f64 v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index f160cc96654fc..b24ebbd9435cf 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -18,35 +18,54 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b ; CHECK-NEXT: s_add_u32 s0, s0, s6 ; CHECK-NEXT: s_addc_u32 s1, s1, s7 ; CHECK-NEXT: s_add_u32 s0, s0, -8 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base ; CHECK-NEXT: s_addc_u32 s1, s1, -1 ; CHECK-NEXT: s_cmp_eq_u32 s1, s5 ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: ; %bb.1: ; %Flow6 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB0_4 +; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: .LBB0_2: ; %atomicrmw.phi ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_3: ; %atomicrmw.global -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB0_3: ; %atomicrmw.check.private +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: s_cmp_eq_u32 s1, s5 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: ; %bb.4: ; %atomicrmw.global +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_cbranch_execnz .LBB0_2 -; CHECK-NEXT: .LBB0_4: ; %atomicrmw.private +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 +; CHECK-NEXT: ; %bb.6: ; %atomicrmw.private ; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cselect_b32 s0, s0, -1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_cselect_b32 s4, s0, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; CHECK-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen ; CHECK-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; CHECK-NEXT: .LBB0_7: ; %Flow5 +; CHECK-NEXT: s_cbranch_execnz .LBB0_2 +; CHECK-NEXT: .LBB0_8: ; %atomicrmw.shared +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b32 s0, s0, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: ds_add_f64 v2, v[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 6ce9f4c367f48..258aa9e299c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -122,25 +122,35 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: s_mov_b64 s[0:1], src_private_base +; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_cmp_eq_u32 s5, s1 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB3_5 -; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: ; %bb.3: ; %Flow6 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccz .LBB3_6 +; CHECK-NEXT: s_cbranch_vccz .LBB3_10 ; CHECK-NEXT: .LBB3_4: ; %atomicrmw.phi ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB3_5: ; %atomicrmw.global -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB3_5: ; %atomicrmw.check.private +; CHECK-NEXT: s_mov_b64 s[0:1], src_private_base +; CHECK-NEXT: s_cmp_eq_u32 s5, s1 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_mov_b64 s[0:1], -1 +; CHECK-NEXT: s_cbranch_vccz .LBB3_7 +; CHECK-NEXT: ; %bb.6: ; %atomicrmw.global +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_cbranch_execnz .LBB3_4 -; CHECK-NEXT: .LBB3_6: ; %atomicrmw.private +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: .LBB3_7: ; %Flow +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB3_9 +; CHECK-NEXT: ; %bb.8: ; %atomicrmw.private ; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cselect_b32 s0, s4, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 @@ -150,6 +160,15 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen ; CHECK-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; CHECK-NEXT: .LBB3_9: ; %Flow5 +; CHECK-NEXT: s_cbranch_execnz .LBB3_4 +; CHECK-NEXT: .LBB3_10: ; %atomicrmw.shared +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 +; CHECK-NEXT: s_cselect_b32 s0, s4, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: ds_add_f64 v2, v[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 4aee397a0152c..2ca143ffc1118 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -801,37 +801,53 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP3]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 -; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] ; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX942-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX942-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX942-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX942: atomicrmw.shared: +; GFX942-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP3]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX942-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX942: atomicrmw.check.private: +; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; GFX942-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] ; GFX942: atomicrmw.private: ; GFX942-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; GFX942-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 -; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] ; GFX942-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; GFX942-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] ; GFX942: atomicrmw.global: -; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX942-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX942-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX942-NEXT: br label [[ATOMICRMW_PHI]] ; GFX942: atomicrmw.phi: -; GFX942-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX942-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX942-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX942: atomicrmw.end: ; GFX942-NEXT: ret double [[RES]] @@ -1227,35 +1243,201 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) { } define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { -; ALL-LABEL: @test_atomicrmw_fadd_f64_flat( -; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) -; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] -; ALL: atomicrmw.private: -; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) -; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] -; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] -; ALL: atomicrmw.global: -; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]] -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end1: -; ALL-NEXT: br label [[ATOMICRMW_PHI]] -; ALL: atomicrmw.phi: -; ALL-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] -; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[RES]] +; CI-LABEL: @test_atomicrmw_fadd_f64_flat( +; CI-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; CI-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CI: atomicrmw.private: +; CI-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CI-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; CI-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CI: atomicrmw.global: +; CI-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end1: +; CI-NEXT: br label [[ATOMICRMW_PHI]] +; CI: atomicrmw.phi: +; CI-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; CI-NEXT: br label [[ATOMICRMW_END:%.*]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[RES]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat( +; GFX9-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX9-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX9: atomicrmw.private: +; GFX9-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX9-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX9-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX9: atomicrmw.global: +; GFX9-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end1: +; GFX9-NEXT: br label [[ATOMICRMW_PHI]] +; GFX9: atomicrmw.phi: +; GFX9-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX9-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[RES]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat( +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end1: +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat( +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] seq_cst, align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX90A-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end1: +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], [[ATOMICRMW_END1]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX942-LABEL: @test_atomicrmw_fadd_f64_flat( +; GFX942-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX942-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX942: atomicrmw.shared: +; GFX942-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] seq_cst, align 8 +; GFX942-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX942: atomicrmw.check.private: +; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX942-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX942: atomicrmw.private: +; GFX942-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX942-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX942-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.global: +; GFX942-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX942-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX942: atomicrmw.start: +; GFX942-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX942-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX942-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX942-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX942-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] seq_cst seq_cst, align 8 +; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX942-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX942: atomicrmw.end1: +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.phi: +; GFX942-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], [[ATOMICRMW_END1]] ] +; GFX942-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX942: atomicrmw.end: +; GFX942-NEXT: ret double [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat( +; GFX11-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX11-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX11: atomicrmw.private: +; GFX11-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX11-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX11-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX11: atomicrmw.global: +; GFX11-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end1: +; GFX11-NEXT: br label [[ATOMICRMW_PHI]] +; GFX11: atomicrmw.phi: +; GFX11-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX11-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[RES]] ; %res = atomicrmw fadd ptr %ptr, double %value seq_cst ret double %res @@ -1270,7 +1452,7 @@ define double @test_atomicrmw_fadd_f64_flat__noprivate(ptr %ptr, double %value) ; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]] ; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -2814,69 +2996,396 @@ define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mo } define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) -; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] -; ALL: atomicrmw.private: -; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) -; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] -; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] -; ALL: atomicrmw.global: -; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end1: -; ALL-NEXT: br label [[ATOMICRMW_PHI]] -; ALL: atomicrmw.phi: -; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; CI-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CI: atomicrmw.private: +; CI-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CI-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; CI-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CI: atomicrmw.global: +; CI-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end1: +; CI-NEXT: br label [[ATOMICRMW_PHI]] +; CI: atomicrmw.phi: +; CI-NEXT: br label [[ATOMICRMW_END:%.*]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX9-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX9: atomicrmw.private: +; GFX9-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX9-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX9-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX9: atomicrmw.global: +; GFX9-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end1: +; GFX9-NEXT: br label [[ATOMICRMW_PHI]] +; GFX9: atomicrmw.phi: +; GFX9-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end1: +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX90A-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end1: +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX942-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX942-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX942-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX942: atomicrmw.shared: +; GFX942-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX942-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX942: atomicrmw.check.private: +; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX942-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX942: atomicrmw.private: +; GFX942-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX942-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX942-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.global: +; GFX942-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX942-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX942: atomicrmw.start: +; GFX942-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX942-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX942-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX942-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX942-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] monotonic monotonic, align 8 +; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX942-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX942: atomicrmw.end1: +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.phi: +; GFX942-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX942: atomicrmw.end: +; GFX942-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX11-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX11: atomicrmw.private: +; GFX11-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX11-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX11-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX11: atomicrmw.global: +; GFX11-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end1: +; GFX11-NEXT: br label [[ATOMICRMW_PHI]] +; GFX11: atomicrmw.phi: +; GFX11-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void ; %unused = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret void } define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) -; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] -; ALL: atomicrmw.private: -; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) -; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] -; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] -; ALL: atomicrmw.global: -; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end1: -; ALL-NEXT: br label [[ATOMICRMW_PHI]] -; ALL: atomicrmw.phi: -; ALL-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] -; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[RET]] +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; CI-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CI: atomicrmw.private: +; CI-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CI-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; CI-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CI: atomicrmw.global: +; CI-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end1: +; CI-NEXT: br label [[ATOMICRMW_PHI]] +; CI: atomicrmw.phi: +; CI-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; CI-NEXT: br label [[ATOMICRMW_END:%.*]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[RET]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX9-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX9: atomicrmw.private: +; GFX9-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX9-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX9-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX9: atomicrmw.global: +; GFX9-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end1: +; GFX9-NEXT: br label [[ATOMICRMW_PHI]] +; GFX9: atomicrmw.phi: +; GFX9-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX9-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[RET]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end1: +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[RET]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX90A-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end1: +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[RET:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], [[ATOMICRMW_END1]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RET]] +; +; GFX942-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX942-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) +; GFX942-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX942: atomicrmw.shared: +; GFX942-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX942-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX942: atomicrmw.check.private: +; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX942-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX942: atomicrmw.private: +; GFX942-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX942-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX942-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP3]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.global: +; GFX942-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX942-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(1) [[TMP4]], align 8 +; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX942: atomicrmw.start: +; GFX942-NEXT: [[LOADED:%.*]] = phi double [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START]] ] +; GFX942-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX942-NEXT: [[TMP6:%.*]] = bitcast double [[NEW2]] to i64 +; GFX942-NEXT: [[TMP7:%.*]] = bitcast double [[LOADED]] to i64 +; GFX942-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[TMP4]], i64 [[TMP7]], i64 [[TMP6]] monotonic monotonic, align 8 +; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP8]], 1 +; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP8]], 0 +; GFX942-NEXT: [[TMP9]] = bitcast i64 [[NEWLOADED]] to double +; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX942: atomicrmw.end1: +; GFX942-NEXT: br label [[ATOMICRMW_PHI]] +; GFX942: atomicrmw.phi: +; GFX942-NEXT: [[RET:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], [[ATOMICRMW_END1]] ] +; GFX942-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX942: atomicrmw.end: +; GFX942-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX11-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX11: atomicrmw.private: +; GFX11-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX11-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX11-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX11: atomicrmw.global: +; GFX11-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end1: +; GFX11-NEXT: br label [[ATOMICRMW_PHI]] +; GFX11: atomicrmw.phi: +; GFX11-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX11-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[RET]] ; %ret = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret double %ret diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll index 7692fd34312ff..1bf821fd53eab 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -577,6 +577,13 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] +; GFX90A: [[ATOMICRMW_SHARED]]: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP3]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] ; GFX90A: [[ATOMICRMW_PRIVATE]]: @@ -584,18 +591,26 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 ; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] ; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_GLOBAL]]: -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_PHI]]: -; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX942-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX942-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX942-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; GFX942-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] +; GFX942: [[ATOMICRMW_SHARED]]: +; GFX942-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP3]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]] +; GFX942-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX942: [[ATOMICRMW_CHECK_PRIVATE]]: ; GFX942-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; GFX942-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] ; GFX942: [[ATOMICRMW_PRIVATE]]: @@ -603,12 +618,13 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { ; GFX942-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 ; GFX942-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] ; GFX942-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 -; GFX942-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX942-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX942: [[ATOMICRMW_GLOBAL]]: -; GFX942-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX942-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX942-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]] ; GFX942-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX942: [[ATOMICRMW_PHI]]: -; GFX942-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX942-NEXT: [[RES:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] ; GFX942-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX942: [[ATOMICRMW_END]]: ; GFX942-NEXT: ret double [[RES]]