@@ -2102,23 +2102,10 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
21022102; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
21032103; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
21042104; GFX1250-NEXT: s_wait_kmcnt 0x0
2105- ; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2
2106- ; GFX1250-NEXT: s_mov_b32 s2, 0
2107- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2108- ; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start
2109- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2105+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s2
2106+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2107+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
21102108; GFX1250-NEXT: s_wait_dscnt 0x0
2111- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2112- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1]
2113- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2114- ; GFX1250-NEXT: s_wait_dscnt 0x0
2115- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2116- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2117- ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
2118- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2119- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2120- ; GFX1250-NEXT: s_cbranch_execnz .LBB51_1
2121- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
21222109; GFX1250-NEXT: s_endpgm
21232110main_body:
21242111 %ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2148,24 +2135,9 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
21482135; GFX1250: ; %bb.0: ; %main_body
21492136; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
21502137; GFX1250-NEXT: s_wait_kmcnt 0x0
2151- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2152- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2153- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2154- ; GFX1250-NEXT: s_mov_b32 s0, 0
2155- ; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start
2156- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2157- ; GFX1250-NEXT: s_wait_dscnt 0x0
2158- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2159- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2160- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2161- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2138+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2139+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
21622140; GFX1250-NEXT: s_wait_dscnt 0x0
2163- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2164- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2165- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2166- ; GFX1250-NEXT: s_cbranch_execnz .LBB52_1
2167- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2168- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
21692141; GFX1250-NEXT: s_set_pc_i64 s[30:31]
21702142main_body:
21712143 %ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2197,24 +2169,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
21972169; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
21982170; GFX1250: ; %bb.0: ; %main_body
21992171; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2172+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22002173; GFX1250-NEXT: s_wait_kmcnt 0x0
2201- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2202- ; GFX1250-NEXT: s_mov_b32 s0, 0
2203- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2204- ; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start
2205- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2174+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2175+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
22062176; GFX1250-NEXT: s_wait_dscnt 0x0
2207- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2208- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2209- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210- ; GFX1250-NEXT: s_wait_dscnt 0x0
2211- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2212- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2213- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2214- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2215- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2216- ; GFX1250-NEXT: s_cbranch_execnz .LBB53_1
2217- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
22182177; GFX1250-NEXT: s_endpgm
22192178main_body:
22202179 %ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2246,24 +2205,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
22462205; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
22472206; GFX1250: ; %bb.0: ; %main_body
22482207; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2208+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22492209; GFX1250-NEXT: s_wait_kmcnt 0x0
2250- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2251- ; GFX1250-NEXT: s_mov_b32 s0, 0
2252- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2253- ; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
2254- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2255- ; GFX1250-NEXT: s_wait_dscnt 0x0
2256- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2257- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2258- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2211+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
22592212; GFX1250-NEXT: s_wait_dscnt 0x0
2260- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2261- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2262- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2263- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2264- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2265- ; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
2266- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
22672213; GFX1250-NEXT: s_endpgm
22682214main_body:
22692215 %ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2295,24 +2241,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22952241; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
22962242; GFX1250: ; %bb.0: ; %main_body
22972243; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2244+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
22982245; GFX1250-NEXT: s_wait_kmcnt 0x0
2299- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2300- ; GFX1250-NEXT: s_mov_b32 s0, 0
2301- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2302- ; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start
2303- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2304- ; GFX1250-NEXT: s_wait_dscnt 0x0
2305- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2306- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2307- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2246+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2247+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
23082248; GFX1250-NEXT: s_wait_dscnt 0x0
2309- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2310- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2311- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2312- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2313- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2314- ; GFX1250-NEXT: s_cbranch_execnz .LBB55_1
2315- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
23162249; GFX1250-NEXT: s_endpgm
23172250main_body:
23182251 %ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst
@@ -2341,23 +2274,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
23412274; GFX1250: ; %bb.0: ; %main_body
23422275; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
23432276; GFX1250-NEXT: s_wait_kmcnt 0x0
2344- ; GFX1250-NEXT: v_mov_b32_e32 v2, v0
2345- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2346- ; GFX1250-NEXT: s_mov_b32 s0, 0
2347- ; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start
2348- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2349- ; GFX1250-NEXT: s_wait_dscnt 0x0
2350- ; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
2351- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2352- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
2353- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
2277+ ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2278+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
23542279; GFX1250-NEXT: s_wait_dscnt 0x0
2355- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
2356- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2357- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2358- ; GFX1250-NEXT: s_cbranch_execnz .LBB56_1
2359- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2360- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
23612280; GFX1250-NEXT: s_set_pc_i64 s[30:31]
23622281main_body:
23632282 %ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2387,24 +2306,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
23872306; GFX1250: ; %bb.0: ; %main_body
23882307; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
23892308; GFX1250-NEXT: s_wait_kmcnt 0x0
2390- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2391- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2392- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2393- ; GFX1250-NEXT: s_mov_b32 s0, 0
2394- ; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start
2395- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2396- ; GFX1250-NEXT: s_wait_dscnt 0x0
2397- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2398- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2399- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2400- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2309+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2310+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24012311; GFX1250-NEXT: s_wait_dscnt 0x0
2402- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2403- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2404- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2405- ; GFX1250-NEXT: s_cbranch_execnz .LBB57_1
2406- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2407- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
24082312; GFX1250-NEXT: s_set_pc_i64 s[30:31]
24092313main_body:
24102314 %ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2434,24 +2338,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
24342338; GFX1250: ; %bb.0: ; %main_body
24352339; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
24362340; GFX1250-NEXT: s_wait_kmcnt 0x0
2437- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2438- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2439- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2440- ; GFX1250-NEXT: s_mov_b32 s0, 0
2441- ; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start
2442- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2443- ; GFX1250-NEXT: s_wait_dscnt 0x0
2444- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2445- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2446- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2447- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2341+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2342+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24482343; GFX1250-NEXT: s_wait_dscnt 0x0
2449- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2450- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2451- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2452- ; GFX1250-NEXT: s_cbranch_execnz .LBB58_1
2453- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2454- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
24552344; GFX1250-NEXT: s_set_pc_i64 s[30:31]
24562345main_body:
24572346 %ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
0 commit comments