@@ -2102,23 +2102,10 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
2102
2102
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
2103
2103
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
2104
2104
; GFX1250-NEXT: s_wait_kmcnt 0x0
2105
- ; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2
2106
- ; GFX1250-NEXT: s_mov_b32 s2, 0
2107
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2108
- ; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start
2109
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2105
+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s2
2106
+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2107
+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
2110
2108
; GFX1250-NEXT: s_wait_dscnt 0x0
2111
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2112
- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1]
2113
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2114
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2115
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2116
- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2117
- ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
2118
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2119
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2120
- ; GFX1250-NEXT: s_cbranch_execnz .LBB51_1
2121
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2122
2109
; GFX1250-NEXT: s_endpgm
2123
2110
main_body:
2124
2111
%ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2148,24 +2135,9 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
2148
2135
; GFX1250: ; %bb.0: ; %main_body
2149
2136
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2150
2137
; GFX1250-NEXT: s_wait_kmcnt 0x0
2151
- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2152
- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2153
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2154
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2155
- ; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start
2156
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2157
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2158
- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2159
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2160
- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2161
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2138
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2139
+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2162
2140
; GFX1250-NEXT: s_wait_dscnt 0x0
2163
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2164
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2165
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2166
- ; GFX1250-NEXT: s_cbranch_execnz .LBB52_1
2167
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2168
- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
2169
2141
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
2170
2142
main_body:
2171
2143
%ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2197,24 +2169,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
2197
2169
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
2198
2170
; GFX1250: ; %bb.0: ; %main_body
2199
2171
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2172
+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
2200
2173
; GFX1250-NEXT: s_wait_kmcnt 0x0
2201
- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2202
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2203
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2204
- ; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start
2205
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2174
+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2175
+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
2206
2176
; GFX1250-NEXT: s_wait_dscnt 0x0
2207
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2208
- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2209
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2211
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2212
- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2213
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2214
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2215
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2216
- ; GFX1250-NEXT: s_cbranch_execnz .LBB53_1
2217
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2218
2177
; GFX1250-NEXT: s_endpgm
2219
2178
main_body:
2220
2179
%ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2246,24 +2205,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
2246
2205
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
2247
2206
; GFX1250: ; %bb.0: ; %main_body
2248
2207
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2208
+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
2249
2209
; GFX1250-NEXT: s_wait_kmcnt 0x0
2250
- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2251
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2252
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2253
- ; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
2254
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2255
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2256
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2257
- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2258
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2210
+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2211
+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
2259
2212
; GFX1250-NEXT: s_wait_dscnt 0x0
2260
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2261
- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2262
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2263
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2264
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2265
- ; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
2266
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2267
2213
; GFX1250-NEXT: s_endpgm
2268
2214
main_body:
2269
2215
%ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2295,24 +2241,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
2295
2241
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
2296
2242
; GFX1250: ; %bb.0: ; %main_body
2297
2243
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
2244
+ ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
2298
2245
; GFX1250-NEXT: s_wait_kmcnt 0x0
2299
- ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
2300
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2301
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2302
- ; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start
2303
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2304
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2305
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2306
- ; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
2307
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
2246
+ ; GFX1250-NEXT: v_mov_b32_e32 v2, s0
2247
+ ; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
2308
2248
; GFX1250-NEXT: s_wait_dscnt 0x0
2309
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
2310
- ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
2311
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2312
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2313
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2314
- ; GFX1250-NEXT: s_cbranch_execnz .LBB55_1
2315
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2316
2249
; GFX1250-NEXT: s_endpgm
2317
2250
main_body:
2318
2251
%ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst
@@ -2341,23 +2274,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
2341
2274
; GFX1250: ; %bb.0: ; %main_body
2342
2275
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2343
2276
; GFX1250-NEXT: s_wait_kmcnt 0x0
2344
- ; GFX1250-NEXT: v_mov_b32_e32 v2, v0
2345
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2346
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2347
- ; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start
2348
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2349
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2350
- ; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
2351
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2352
- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
2353
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
2277
+ ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2278
+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2354
2279
; GFX1250-NEXT: s_wait_dscnt 0x0
2355
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
2356
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2357
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2358
- ; GFX1250-NEXT: s_cbranch_execnz .LBB56_1
2359
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2360
- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
2361
2280
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
2362
2281
main_body:
2363
2282
%ret = atomicrmw fadd ptr addrspace (3 ) %ptr , double 4 .0 seq_cst , !amdgpu.no.fine.grained.memory !0
@@ -2387,24 +2306,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
2387
2306
; GFX1250: ; %bb.0: ; %main_body
2388
2307
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2389
2308
; GFX1250-NEXT: s_wait_kmcnt 0x0
2390
- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2391
- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2392
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2393
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2394
- ; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start
2395
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2396
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2397
- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2398
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2399
- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2400
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2309
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2310
+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2401
2311
; GFX1250-NEXT: s_wait_dscnt 0x0
2402
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2403
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2404
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2405
- ; GFX1250-NEXT: s_cbranch_execnz .LBB57_1
2406
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2407
- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
2408
2312
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
2409
2313
main_body:
2410
2314
%ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
@@ -2434,24 +2338,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
2434
2338
; GFX1250: ; %bb.0: ; %main_body
2435
2339
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2436
2340
; GFX1250-NEXT: s_wait_kmcnt 0x0
2437
- ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
2438
- ; GFX1250-NEXT: v_mov_b32_e32 v4, v1
2439
- ; GFX1250-NEXT: ds_load_b64 v[0:1], v0
2440
- ; GFX1250-NEXT: s_mov_b32 s0, 0
2441
- ; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start
2442
- ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
2443
- ; GFX1250-NEXT: s_wait_dscnt 0x0
2444
- ; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
2445
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2446
- ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
2447
- ; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
2341
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
2342
+ ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2448
2343
; GFX1250-NEXT: s_wait_dscnt 0x0
2449
- ; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
2450
- ; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
2451
- ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
2452
- ; GFX1250-NEXT: s_cbranch_execnz .LBB58_1
2453
- ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
2454
- ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
2455
2344
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
2456
2345
main_body:
2457
2346
%ret = call double @llvm.amdgcn.ds.fadd.f64 (ptr addrspace (3 ) %ptr , double %data , i32 0 , i32 0 , i1 0 )
0 commit comments