Skip to content

Commit 334d0be

Browse files
authored
[AMDGPU] Support 64-bit LDS atomic fadd on gfx1250 (#152368)
1 parent cae7beb commit 334d0be

File tree

3 files changed

+53
-220
lines changed

3 files changed

+53
-220
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1081,7 +1081,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10811081
}
10821082

10831083
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1084-
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1084+
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
10851085

10861086
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
10871087
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 32 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -2243,36 +2243,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
22432243
;
22442244
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
22452245
; GFX1250: ; %bb.0: ; %main_body
2246+
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
22462247
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
2247-
; GFX1250-NEXT: s_mov_b32 s0, 0
2248-
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
2249-
; GFX1250-NEXT: s_mov_b32 s2, exec_lo
2248+
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
22502249
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
22512250
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
2252-
; GFX1250-NEXT: s_cbranch_execz .LBB51_3
2251+
; GFX1250-NEXT: s_cbranch_execz .LBB51_2
22532252
; GFX1250-NEXT: ; %bb.1:
2254-
; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
2255-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2256-
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
2257-
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
2253+
; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
2254+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2255+
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2256+
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
22582257
; GFX1250-NEXT: s_wait_kmcnt 0x0
2259-
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
2260-
; GFX1250-NEXT: ds_load_b64 v[2:3], v4
2261-
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
2262-
; GFX1250-NEXT: .LBB51_2: ; %atomicrmw.start
2263-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2264-
; GFX1250-NEXT: s_wait_dscnt 0x0
2265-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2266-
; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
2267-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
2258+
; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
2259+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
22682260
; GFX1250-NEXT: s_wait_dscnt 0x0
2269-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
2270-
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2271-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2272-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2273-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2274-
; GFX1250-NEXT: s_cbranch_execnz .LBB51_2
2275-
; GFX1250-NEXT: .LBB51_3:
2261+
; GFX1250-NEXT: .LBB51_2:
22762262
; GFX1250-NEXT: s_endpgm
22772263
main_body:
22782264
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2322,36 +2308,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
23222308
;
23232309
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
23242310
; GFX1250: ; %bb.0: ; %main_body
2311+
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
23252312
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
2326-
; GFX1250-NEXT: s_mov_b32 s0, 0
2327-
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
2328-
; GFX1250-NEXT: s_mov_b32 s2, exec_lo
2313+
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
23292314
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
23302315
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
2331-
; GFX1250-NEXT: s_cbranch_execz .LBB52_3
2316+
; GFX1250-NEXT: s_cbranch_execz .LBB52_2
23322317
; GFX1250-NEXT: ; %bb.1:
2333-
; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
2334-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2335-
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
2336-
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
2318+
; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
2319+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2320+
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2321+
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
23372322
; GFX1250-NEXT: s_wait_kmcnt 0x0
2338-
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
2339-
; GFX1250-NEXT: ds_load_b64 v[2:3], v4
2340-
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
2341-
; GFX1250-NEXT: .LBB52_2: ; %atomicrmw.start
2342-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2323+
; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
2324+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
23432325
; GFX1250-NEXT: s_wait_dscnt 0x0
2344-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2345-
; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
2346-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
2347-
; GFX1250-NEXT: s_wait_dscnt 0x0
2348-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
2349-
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2350-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2351-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2352-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2353-
; GFX1250-NEXT: s_cbranch_execnz .LBB52_2
2354-
; GFX1250-NEXT: .LBB52_3:
2326+
; GFX1250-NEXT: .LBB52_2:
23552327
; GFX1250-NEXT: s_endpgm
23562328
main_body:
23572329
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2401,36 +2373,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
24012373
;
24022374
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
24032375
; GFX1250: ; %bb.0: ; %main_body
2376+
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
24042377
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
2405-
; GFX1250-NEXT: s_mov_b32 s0, 0
2406-
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
2407-
; GFX1250-NEXT: s_mov_b32 s2, exec_lo
2378+
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
24082379
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
24092380
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
2410-
; GFX1250-NEXT: s_cbranch_execz .LBB53_3
2381+
; GFX1250-NEXT: s_cbranch_execz .LBB53_2
24112382
; GFX1250-NEXT: ; %bb.1:
2412-
; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
2413-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2414-
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
2415-
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
2383+
; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
2384+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2385+
; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2386+
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
24162387
; GFX1250-NEXT: s_wait_kmcnt 0x0
2417-
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
2418-
; GFX1250-NEXT: ds_load_b64 v[2:3], v4
2419-
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
2420-
; GFX1250-NEXT: .LBB53_2: ; %atomicrmw.start
2421-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2422-
; GFX1250-NEXT: s_wait_dscnt 0x0
2423-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2424-
; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
2425-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
2388+
; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
2389+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
24262390
; GFX1250-NEXT: s_wait_dscnt 0x0
2427-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
2428-
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2429-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2430-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2431-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2432-
; GFX1250-NEXT: s_cbranch_execnz .LBB53_2
2433-
; GFX1250-NEXT: .LBB53_3:
2391+
; GFX1250-NEXT: .LBB53_2:
24342392
; GFX1250-NEXT: s_endpgm
24352393
main_body:
24362394
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2459,23 +2417,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
24592417
; GFX1250: ; %bb.0: ; %main_body
24602418
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
24612419
; GFX1250-NEXT: s_wait_kmcnt 0x0
2462-
; GFX1250-NEXT: v_mov_b32_e32 v2, v0
2463-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2464-
; GFX1250-NEXT: s_mov_b32 s0, 0
2465-
; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
2466-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2467-
; GFX1250-NEXT: s_wait_dscnt 0x0
2468-
; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
2469-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2470-
; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
2471-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
2420+
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2421+
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24722422
; GFX1250-NEXT: s_wait_dscnt 0x0
2473-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
2474-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2475-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2476-
; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
2477-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2478-
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
24792423
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
24802424
main_body:
24812425
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst

llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Lines changed: 20 additions & 131 deletions
Original file line number | Diff line number | Diff line change
@@ -2102,23 +2102,10 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
21022102
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
21032103
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
21042104
; GFX1250-NEXT: s_wait_kmcnt 0x0
2105-
; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2
2106-
; GFX1250-NEXT: s_mov_b32 s2, 0
2107-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2108-
; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start
2109-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2105+
; GFX1250-NEXT: v_mov_b32_e32 v2, s2
2106+
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2107+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
21102108
; GFX1250-NEXT: s_wait_dscnt 0x0
2111-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2112-
; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1]
2113-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2114-
; GFX1250-NEXT: s_wait_dscnt 0x0
2115-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2116-
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2117-
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
2118-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2119-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2120-
; GFX1250-NEXT: s_cbranch_execnz .LBB51_1
2121-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
21222109
; GFX1250-NEXT: s_endpgm
21232110
main_body:
21242111
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2148,24 +2135,9 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
21482135
; GFX1250: ; %bb.0: ; %main_body
21492136
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
21502137
; GFX1250-NEXT: s_wait_kmcnt 0x0
2151-
; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2152-
; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2153-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2154-
; GFX1250-NEXT: s_mov_b32 s0, 0
2155-
; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start
2156-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2157-
; GFX1250-NEXT: s_wait_dscnt 0x0
2158-
; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2159-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2160-
; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2161-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2138+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2139+
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
21622140
; GFX1250-NEXT: s_wait_dscnt 0x0
2163-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2164-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2165-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2166-
; GFX1250-NEXT: s_cbranch_execnz .LBB52_1
2167-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2168-
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
21692141
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
21702142
main_body:
21712143
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2197,24 +2169,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
21972169
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
21982170
; GFX1250: ; %bb.0: ; %main_body
21992171
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2172+
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22002173
; GFX1250-NEXT: s_wait_kmcnt 0x0
2201-
; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2202-
; GFX1250-NEXT: s_mov_b32 s0, 0
2203-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2204-
; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start
2205-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2174+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2175+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
22062176
; GFX1250-NEXT: s_wait_dscnt 0x0
2207-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2208-
; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2209-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210-
; GFX1250-NEXT: s_wait_dscnt 0x0
2211-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2212-
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2213-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2214-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2215-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2216-
; GFX1250-NEXT: s_cbranch_execnz .LBB53_1
2217-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
22182177
; GFX1250-NEXT: s_endpgm
22192178
main_body:
22202179
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2246,24 +2205,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
22462205
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
22472206
; GFX1250: ; %bb.0: ; %main_body
22482207
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2208+
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22492209
; GFX1250-NEXT: s_wait_kmcnt 0x0
2250-
; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2251-
; GFX1250-NEXT: s_mov_b32 s0, 0
2252-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2253-
; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
2254-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2255-
; GFX1250-NEXT: s_wait_dscnt 0x0
2256-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2257-
; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2258-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2211+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
22592212
; GFX1250-NEXT: s_wait_dscnt 0x0
2260-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2261-
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2262-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2263-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2264-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2265-
; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
2266-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
22672213
; GFX1250-NEXT: s_endpgm
22682214
main_body:
22692215
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2295,24 +2241,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22952241
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
22962242
; GFX1250: ; %bb.0: ; %main_body
22972243
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2244+
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22982245
; GFX1250-NEXT: s_wait_kmcnt 0x0
2299-
; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2300-
; GFX1250-NEXT: s_mov_b32 s0, 0
2301-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2302-
; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start
2303-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2304-
; GFX1250-NEXT: s_wait_dscnt 0x0
2305-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2306-
; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2307-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2246+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2247+
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
23082248
; GFX1250-NEXT: s_wait_dscnt 0x0
2309-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2310-
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2311-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2312-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2313-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2314-
; GFX1250-NEXT: s_cbranch_execnz .LBB55_1
2315-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
23162249
; GFX1250-NEXT: s_endpgm
23172250
main_body:
23182251
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2341,23 +2274,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
23412274
; GFX1250: ; %bb.0: ; %main_body
23422275
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
23432276
; GFX1250-NEXT: s_wait_kmcnt 0x0
2344-
; GFX1250-NEXT: v_mov_b32_e32 v2, v0
2345-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2346-
; GFX1250-NEXT: s_mov_b32 s0, 0
2347-
; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start
2348-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2349-
; GFX1250-NEXT: s_wait_dscnt 0x0
2350-
; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
2351-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2352-
; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
2353-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
2277+
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2278+
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
23542279
; GFX1250-NEXT: s_wait_dscnt 0x0
2355-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
2356-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2357-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2358-
; GFX1250-NEXT: s_cbranch_execnz .LBB56_1
2359-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2360-
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
23612280
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
23622281
main_body:
23632282
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2387,24 +2306,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
23872306
; GFX1250: ; %bb.0: ; %main_body
23882307
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
23892308
; GFX1250-NEXT: s_wait_kmcnt 0x0
2390-
; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2391-
; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2392-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2393-
; GFX1250-NEXT: s_mov_b32 s0, 0
2394-
; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start
2395-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2396-
; GFX1250-NEXT: s_wait_dscnt 0x0
2397-
; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2398-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2399-
; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2400-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2309+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2310+
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24012311
; GFX1250-NEXT: s_wait_dscnt 0x0
2402-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2403-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2404-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2405-
; GFX1250-NEXT: s_cbranch_execnz .LBB57_1
2406-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2407-
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
24082312
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
24092313
main_body:
24102314
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2434,24 +2338,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
24342338
; GFX1250: ; %bb.0: ; %main_body
24352339
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
24362340
; GFX1250-NEXT: s_wait_kmcnt 0x0
2437-
; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2438-
; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2439-
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2440-
; GFX1250-NEXT: s_mov_b32 s0, 0
2441-
; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start
2442-
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2443-
; GFX1250-NEXT: s_wait_dscnt 0x0
2444-
; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2445-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2446-
; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2447-
; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2341+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2342+
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24482343
; GFX1250-NEXT: s_wait_dscnt 0x0
2449-
; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2450-
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2451-
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2452-
; GFX1250-NEXT: s_cbranch_execnz .LBB58_1
2453-
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2454-
; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
24552344
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
24562345
main_body:
24572346
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)

0 commit comments

Comments
 (0)