@@ -40,10 +40,11 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
4040; GFX12-SDAG: ; %bb.0: ; %entry
4141; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
4242; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
43- ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
43+ ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
44+ ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
4445; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
4546; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
46- ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
47+ ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
4748; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
4849; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
4950; GFX12-SDAG-NEXT: s_endpgm
@@ -80,10 +81,14 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
8081; GFX12-GISEL: ; %bb.0: ; %entry
8182; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
8283; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
83- ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
84+ ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
85+ ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
86+ ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
87+ ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
88+ ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
8489; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
8590; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
86- ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
91+ ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
8792; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
8893; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
8994; GFX12-GISEL-NEXT: s_endpgm
@@ -129,10 +134,11 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
129134; GFX12-SDAG: ; %bb.0: ; %entry
130135; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
131136; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
132- ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
137+ ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
138+ ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
133139; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
134140; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
135- ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
141+ ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
136142; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
137143; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
138144; GFX12-SDAG-NEXT: s_endpgm
@@ -169,10 +175,14 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
169175; GFX12-GISEL: ; %bb.0: ; %entry
170176; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
171177; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
172- ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
178+ ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
179+ ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
180+ ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
181+ ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
182+ ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
173183; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
174184; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
175- ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
185+ ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
176186; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
177187; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
178188; GFX12-GISEL-NEXT: s_endpgm
@@ -217,14 +227,15 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
217227;
218228; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
219229; GFX12-SDAG: ; %bb.0: ; %entry
230+ ; GFX12-SDAG-NEXT: s_clause 0x1
220231; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
221- ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
222232; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
223233; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
224- ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
234+ ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
235+ ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
225236; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
226237; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
227- ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
238+ ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
228239; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
229240; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
230241; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -234,33 +245,39 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
234245; GFX9-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
235246; GFX9-GISEL: ; %bb.0: ; %entry
236247; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
237- ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
248+ ; GFX9-GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
238249; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
239- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
240- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
241- ; GFX9-GISEL-NEXT: flat_load_dword v2, v[0:1] offset:16
242- ; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], 0
250+ ; GFX9-GISEL-NEXT: s_add_u32 s2, s0, 16
251+ ; GFX9-GISEL-NEXT: s_addc_u32 s3, s1, 0
252+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
253+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
254+ ; GFX9-GISEL-NEXT: flat_load_dword v0, v[0:1]
255+ ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], 0
243256; GFX9-GISEL-NEXT: .LBB2_1: ; %atomicrmw.start
244257; GFX9-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
245258; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
246- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, v2
247- ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, s2, v3
248- ; GFX9-GISEL-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
249- ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
250- ; GFX9-GISEL-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
259+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, v0
260+ ; GFX9-GISEL-NEXT: s_add_u32 s8, s0, 16
261+ ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, s6, v1
262+ ; GFX9-GISEL-NEXT: s_addc_u32 s9, s1, 0
263+ ; GFX9-GISEL-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
264+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8
265+ ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
266+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s9
267+ ; GFX9-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
251268; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
252269; GFX9-GISEL-NEXT: buffer_wbinvl1_vol
253- ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
254- ; GFX9-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
255- ; GFX9-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
270+ ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
271+ ; GFX9-GISEL-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
272+ ; GFX9-GISEL-NEXT: s_andn2_b64 exec, exec, s[2:3]
256273; GFX9-GISEL-NEXT: s_cbranch_execnz .LBB2_1
257274; GFX9-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
258- ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
275+ ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
259276; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
260277; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
261- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
262- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
263- ; GFX9-GISEL-NEXT: flat_store_dword v[0:1], v2
278+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
279+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
280+ ; GFX9-GISEL-NEXT: flat_store_dword v[1:2], v0
264281; GFX9-GISEL-NEXT: s_endpgm
265282;
266283; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
@@ -269,10 +286,14 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
269286; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
270287; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
271288; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
272- ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
289+ ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16
290+ ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
291+ ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
292+ ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
293+ ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
273294; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
274295; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
275- ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
296+ ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
276297; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
277298; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
278299; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
0 commit comments