@@ -889,19 +889,19 @@ define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %da
889889 ret void
890890}
891891
892- define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset (ptr addrspace (1 ) %ptr , i32 %data ) {
892+ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset (ptr addrspace (1 ) %ptr , i32 %data , ptr addrspace ( 1 ) %dst ) {
893893; GFX9-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
894894; GFX9-SDAG: ; %bb.0:
895895; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
896896; GFX9-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
897897; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], 0
898898; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
899899; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
900- ; GFX9-SDAG-NEXT: s_load_dword s4 , s[2:3], 0x1000
900+ ; GFX9-SDAG-NEXT: s_load_dword s7 , s[2:3], 0x1000
901901; GFX9-SDAG-NEXT: s_add_u32 s2, s2, 0x1000
902902; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, 0
903903; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
904- ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
904+ ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
905905; GFX9-SDAG-NEXT: .LBB10_1: ; %atomicrmw.start
906906; GFX9-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
907907; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, v1
@@ -917,18 +917,23 @@ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspac
917917; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB10_1
918918; GFX9-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
919919; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
920- ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off
920+ ; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
921+ ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
922+ ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
923+ ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
921924; GFX9-SDAG-NEXT: s_endpgm
922925;
923926; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
924927; GFX12-SDAG: ; %bb.0:
925928; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
929+ ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
930+ ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
926931; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
927- ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
928- ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0 , v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
932+ ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
933+ ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1 , v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
929934; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
930935; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
931- ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
936+ ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
932937; GFX12-SDAG-NEXT: s_endpgm
933938;
934939; GFX9-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
@@ -938,9 +943,9 @@ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspac
938943; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], 0
939944; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1000
940945; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
941- ; GFX9-GISEL-NEXT: s_load_dword s4 , s[0:1], 0x1000
946+ ; GFX9-GISEL-NEXT: s_load_dword s7 , s[0:1], 0x1000
942947; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
943- ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
948+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
944949; GFX9-GISEL-NEXT: .LBB10_1: ; %atomicrmw.start
945950; GFX9-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
946951; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, v1
@@ -956,22 +961,27 @@ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspac
956961; GFX9-GISEL-NEXT: s_cbranch_execnz .LBB10_1
957962; GFX9-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
958963; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
959- ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off
964+ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
965+ ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
966+ ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
967+ ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
960968; GFX9-GISEL-NEXT: s_endpgm
961969;
962970; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
963971; GFX12-GISEL: ; %bb.0:
972+ ; GFX12-GISEL-NEXT: s_clause 0x1
964973; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
974+ ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
965975; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
966976; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
967977; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
968978; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
969979; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
970- ; GFX12-GISEL-NEXT: global_store_b32 v[0:1] , v0, off
980+ ; GFX12-GISEL-NEXT: global_store_b32 v1 , v0, s[4:5]
971981; GFX12-GISEL-NEXT: s_endpgm
972982 %gep = getelementptr i32 , ptr addrspace (1 ) %ptr , i64 1024
973983 %ret = atomicrmw usub_cond ptr addrspace (1 ) %gep , i32 %data syncscope("agent" ) seq_cst , align 4
974- store i32 %ret , ptr addrspace (1 ) undef
984+ store i32 %ret , ptr addrspace (1 ) %dst
975985 ret void
976986}
977987
0 commit comments