@@ -602,13 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602602; GFX12-NEXT: s_wait_bvhcnt 0x0
603603; GFX12-NEXT: s_wait_kmcnt 0x0
604604; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
605606; GFX12-NEXT: s_mov_b32 s0, 0
606607; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
607608; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
608609; GFX12-NEXT: s_wait_loadcnt 0x0
609610; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
610- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
611+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
612614; GFX12-NEXT: s_wait_storecnt 0x0
613615; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
614616; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -755,18 +757,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
755757; GFX12-NEXT: s_wait_samplecnt 0x0
756758; GFX12-NEXT: s_wait_bvhcnt 0x0
757759; GFX12-NEXT: s_wait_kmcnt 0x0
758- ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
760+ ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761+ ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
759762; GFX12-NEXT: s_mov_b32 s0, 0
760763; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
761764; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
762765; GFX12-NEXT: s_wait_loadcnt 0x0
763- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
766+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
764769; GFX12-NEXT: s_wait_storecnt 0x0
765- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
770+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
766771; GFX12-NEXT: s_wait_loadcnt 0x0
767772; GFX12-NEXT: global_inv scope:SCOPE_DEV
768- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
769- ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
773+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
774+ ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
770775; GFX12-NEXT: s_wait_alu 0xfffe
771776; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
772777; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1183,13 +1188,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
11831188; GFX12-NEXT: s_wait_bvhcnt 0x0
11841189; GFX12-NEXT: s_wait_kmcnt 0x0
11851190; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
11861192; GFX12-NEXT: s_mov_b32 s0, 0
11871193; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
11881194; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
11891195; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11901196; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1191- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1197+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
11931200; GFX12-NEXT: s_wait_storecnt 0x0
11941201; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11951202; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1334,18 +1341,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13341341; GFX12-NEXT: s_wait_samplecnt 0x0
13351342; GFX12-NEXT: s_wait_bvhcnt 0x0
13361343; GFX12-NEXT: s_wait_kmcnt 0x0
1337- ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
1344+ ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345+ ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
13381346; GFX12-NEXT: s_mov_b32 s0, 0
13391347; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
13401348; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
13411349; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1342- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1350+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
13431353; GFX12-NEXT: s_wait_storecnt 0x0
1344- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1354+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13451355; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
13461356; GFX12-NEXT: global_inv scope:SCOPE_DEV
1347- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
1348- ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1357+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
1358+ ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
13491359; GFX12-NEXT: s_wait_alu 0xfffe
13501360; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
13511361; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1789,16 +1799,19 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
17891799; GFX12-NEXT: s_wait_bvhcnt 0x0
17901800; GFX12-NEXT: s_wait_kmcnt 0x0
17911801; GFX12-NEXT: v_mov_b32_e32 v6, s16
1792- ; GFX12-NEXT: v_dual_mov_b32 v4 , v0 :: v_dual_mov_b32 v5 , v1
1802+ ; GFX12-NEXT: v_dual_mov_b32 v2 , v0 :: v_dual_mov_b32 v3 , v1
17931803; GFX12-NEXT: s_mov_b32 s4, 0
17941804; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
17951806; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
17961807; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
17971808; GFX12-NEXT: s_wait_loadcnt 0x0
17981809; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
17991810; GFX12-NEXT: s_wait_storecnt 0x0
18001811; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1801- ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
1812+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813+ ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18021815; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
18031816; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18041817; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1958,21 +1971,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
19581971; GFX12-NEXT: s_wait_bvhcnt 0x0
19591972; GFX12-NEXT: s_wait_kmcnt 0x0
19601973; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
19611975; GFX12-NEXT: s_mov_b32 s4, 0
1962- ; GFX12-NEXT: buffer_load_b64 v[4:5 ], v6, s[0:3], null offen
1976+ ; GFX12-NEXT: buffer_load_b64 v[2:3 ], v6, s[0:3], null offen
19631977; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
19641978; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
19651979; GFX12-NEXT: s_wait_loadcnt 0x0
1966- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967- ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
1980+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
19681981; GFX12-NEXT: s_wait_storecnt 0x0
1969- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970- ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1982+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984+ ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985+ ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
19711986; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
19721987; GFX12-NEXT: s_wait_loadcnt 0x0
19731988; GFX12-NEXT: global_inv scope:SCOPE_DEV
1974- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5 ]
1975- ; GFX12-NEXT: v_dual_mov_b32 v4 , v7 :: v_dual_mov_b32 v5 , v8
1989+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3 ]
1990+ ; GFX12-NEXT: v_dual_mov_b32 v2 , v7 :: v_dual_mov_b32 v3 , v8
19761991; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
19771992; GFX12-NEXT: s_wait_alu 0xfffe
19781993; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
0 commit comments