@@ -2213,29 +2213,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22132213; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
22142214; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
22152215; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
2216- ; GFX90A-NEXT: s_cbranch_execz .LBB72_3
2216+ ; GFX90A-NEXT: s_cbranch_execz .LBB72_2
22172217; GFX90A-NEXT: ; %bb.1:
2218- ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
2219- ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
2220- ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2221- ; GFX90A-NEXT: v_mov_b32_e32 v0, s4
2222- ; GFX90A-NEXT: ds_read_b64 v[2:3], v0
2223- ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2218+ ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
2219+ ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
2220+ ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
22242221; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
2225- ; GFX90A-NEXT: s_mov_b64 s[0:1], 0
2226- ; GFX90A-NEXT: v_mov_b32_e32 v4, s4
2227- ; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start
2228- ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
22292222; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2230- ; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2231- ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2223+ ; GFX90A-NEXT: v_mov_b32_e32 v2, s0
2224+ ; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
22322225; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2233- ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2234- ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2235- ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
2236- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
2237- ; GFX90A-NEXT: s_cbranch_execnz .LBB72_2
2238- ; GFX90A-NEXT: .LBB72_3:
2226+ ; GFX90A-NEXT: .LBB72_2:
22392227; GFX90A-NEXT: s_endpgm
22402228;
22412229; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2245,29 +2233,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22452233; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
22462234; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
22472235; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
2248- ; GFX940-NEXT: s_cbranch_execz .LBB72_3
2236+ ; GFX940-NEXT: s_cbranch_execz .LBB72_2
22492237; GFX940-NEXT: ; %bb.1:
2250- ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
2251- ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
2252- ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2253- ; GFX940-NEXT: v_mov_b32_e32 v0, s4
2254- ; GFX940-NEXT: ds_read_b64 v[2:3], v0
2255- ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2238+ ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
2239+ ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
2240+ ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
22562241; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
2257- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
2258- ; GFX940-NEXT: v_mov_b32_e32 v4, s4
2259- ; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start
2260- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
22612242; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2262- ; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2263- ; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2243+ ; GFX940-NEXT: v_mov_b32_e32 v2, s0
2244+ ; GFX940-NEXT: ds_add_f64 v2, v[0:1]
22642245; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2265- ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2266- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2267- ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2268- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
2269- ; GFX940-NEXT: s_cbranch_execnz .LBB72_2
2270- ; GFX940-NEXT: .LBB72_3:
2246+ ; GFX940-NEXT: .LBB72_2:
22712247; GFX940-NEXT: s_endpgm
22722248main_body:
22732249 %ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst
0 commit comments