@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602602; GFX12-NEXT: s_wait_bvhcnt 0x0
603603; GFX12-NEXT: s_wait_kmcnt 0x0
604604; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
606605; GFX12-NEXT: s_mov_b32 s0, 0
607606; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
608607; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
609608; GFX12-NEXT: s_wait_loadcnt 0x0
610609; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
611- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
610+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
614612; GFX12-NEXT: s_wait_storecnt 0x0
615613; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
616614; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
757755; GFX12-NEXT: s_wait_samplecnt 0x0
758756; GFX12-NEXT: s_wait_bvhcnt 0x0
759757; GFX12-NEXT: s_wait_kmcnt 0x0
760- ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761- ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
758+ ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
762759; GFX12-NEXT: s_mov_b32 s0, 0
763760; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
764761; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
765762; GFX12-NEXT: s_wait_loadcnt 0x0
766- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
763+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
769764; GFX12-NEXT: s_wait_storecnt 0x0
770- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
765+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
771766; GFX12-NEXT: s_wait_loadcnt 0x0
772767; GFX12-NEXT: global_inv scope:SCOPE_DEV
773- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
774- ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
768+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
769+ ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
775770; GFX12-NEXT: s_wait_alu 0xfffe
776771; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
777772; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
11881183; GFX12-NEXT: s_wait_bvhcnt 0x0
11891184; GFX12-NEXT: s_wait_kmcnt 0x0
11901185; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11921186; GFX12-NEXT: s_mov_b32 s0, 0
11931187; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11941188; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11951189; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11961190; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1197- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1191+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
12001193; GFX12-NEXT: s_wait_storecnt 0x0
12011194; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12021195; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13411334; GFX12-NEXT: s_wait_samplecnt 0x0
13421335; GFX12-NEXT: s_wait_bvhcnt 0x0
13431336; GFX12-NEXT: s_wait_kmcnt 0x0
1344- ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345- ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1337+ ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
13461338; GFX12-NEXT: s_mov_b32 s0, 0
13471339; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13481340; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13491341; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1350- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1342+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
13531343; GFX12-NEXT: s_wait_storecnt 0x0
1354- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1344+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13551345; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13561346; GFX12-NEXT: global_inv scope:SCOPE_DEV
1357- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
1358- ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1347+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
1348+ ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
13591349; GFX12-NEXT: s_wait_alu 0xfffe
13601350; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13611351; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
17991789; GFX12-NEXT: s_wait_bvhcnt 0x0
18001790; GFX12-NEXT: s_wait_kmcnt 0x0
18011791; GFX12-NEXT: v_mov_b32_e32 v6, s16
1802- ; GFX12-NEXT: v_dual_mov_b32 v2 , v0 :: v_dual_mov_b32 v3 , v1
1792+ ; GFX12-NEXT: v_dual_mov_b32 v4 , v0 :: v_dual_mov_b32 v5 , v1
18031793; GFX12-NEXT: s_mov_b32 s4, 0
18041794; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
18061795; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
18071796; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18081797; GFX12-NEXT: s_wait_loadcnt 0x0
18091798; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
18101799; GFX12-NEXT: s_wait_storecnt 0x0
18111800; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1812- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813- ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1801+ ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
18151802; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18161803; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18171804; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
19711958; GFX12-NEXT: s_wait_bvhcnt 0x0
19721959; GFX12-NEXT: s_wait_kmcnt 0x0
19731960; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19751961; GFX12-NEXT: s_mov_b32 s4, 0
1976- ; GFX12-NEXT: buffer_load_b64 v[2:3 ], v6, s[0:3], null offen
1962+ ; GFX12-NEXT: buffer_load_b64 v[4:5 ], v6, s[0:3], null offen
19771963; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19781964; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19791965; GFX12-NEXT: s_wait_loadcnt 0x0
1980- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1966+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967+ ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
19811968; GFX12-NEXT: s_wait_storecnt 0x0
1982- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984- ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985- ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1969+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970+ ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
19861971; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19871972; GFX12-NEXT: s_wait_loadcnt 0x0
19881973; GFX12-NEXT: global_inv scope:SCOPE_DEV
1989- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3 ]
1990- ; GFX12-NEXT: v_dual_mov_b32 v2 , v7 :: v_dual_mov_b32 v3 , v8
1974+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5 ]
1975+ ; GFX12-NEXT: v_dual_mov_b32 v4 , v7 :: v_dual_mov_b32 v5 , v8
19911976; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19921977; GFX12-NEXT: s_wait_alu 0xfffe
19931978; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
0 commit comments