@@ -1812,26 +1812,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1812
1812
; GFX12-NEXT: s_wait_samplecnt 0x0
1813
1813
; GFX12-NEXT: s_wait_bvhcnt 0x0
1814
1814
; GFX12-NEXT: s_wait_kmcnt 0x0
1815
- ; GFX12-NEXT: v_mov_b32_e32 v6 , s16
1816
- ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1815
+ ; GFX12-NEXT: v_mov_b32_e32 v8 , s16
1816
+ ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
1817
1817
; GFX12-NEXT: s_mov_b32 s4, 0
1818
- ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1819
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1818
+ ; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen
1820
1819
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
1821
1820
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1822
1821
; GFX12-NEXT: s_wait_loadcnt 0x0
1823
- ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1822
+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
1824
1823
; GFX12-NEXT: s_wait_storecnt 0x0
1825
1824
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1826
- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1 ], v[9:10 ], v[9:10 ]
1827
- ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1828
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1 )
1829
- ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1830
- ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1831
- ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6 , s[0:3], null offen th:TH_ATOMIC_RETURN
1825
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3 ], v[0:1 ], v[6:7 ]
1826
+ ; GFX12-NEXT: v_mov_b32_e32 v0, v2
1827
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2 )
1828
+ ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1829
+ ; GFX12-NEXT: v_mov_b32_e32 v3, v5
1830
+ ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8 , s[0:3], null offen th:TH_ATOMIC_RETURN
1832
1831
; GFX12-NEXT: s_wait_loadcnt 0x0
1833
1832
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1834
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1833
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1834
+ ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
1835
1835
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
1836
1836
; GFX12-NEXT: s_wait_alu 0xfffe
1837
1837
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1854,27 +1854,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1854
1854
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1855
1855
; GFX11: ; %bb.0:
1856
1856
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1857
- ; GFX11-NEXT: v_mov_b32_e32 v6 , s16
1858
- ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1857
+ ; GFX11-NEXT: v_mov_b32_e32 v8 , s16
1858
+ ; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
1859
1859
; GFX11-NEXT: s_mov_b32 s4, 0
1860
- ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
1861
- ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1860
+ ; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen
1862
1861
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
1863
1862
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1864
1863
; GFX11-NEXT: s_waitcnt vmcnt(0)
1865
- ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1864
+ ; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1866
1865
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1867
1866
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1868
- ; GFX11-NEXT: v_max_f64 v[0:1 ], v[9:10 ], v[9:10 ]
1869
- ; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1870
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1 )
1871
- ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1872
- ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1873
- ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6 , s[0:3], 0 offen glc
1867
+ ; GFX11-NEXT: v_max_f64 v[2:3 ], v[0:1 ], v[6:7 ]
1868
+ ; GFX11-NEXT: v_mov_b32_e32 v0, v2
1869
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2 )
1870
+ ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1871
+ ; GFX11-NEXT: v_mov_b32_e32 v3, v5
1872
+ ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8 , s[0:3], 0 offen glc
1874
1873
; GFX11-NEXT: s_waitcnt vmcnt(0)
1875
1874
; GFX11-NEXT: buffer_gl1_inv
1876
1875
; GFX11-NEXT: buffer_gl0_inv
1877
- ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1876
+ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1877
+ ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
1878
1878
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
1879
1879
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1880
1880
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1906,28 +1906,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1906
1906
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1907
1907
; GFX908: ; %bb.0:
1908
1908
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909
- ; GFX908-NEXT: v_mov_b32_e32 v6, s20
1910
- ; GFX908-NEXT: v_mov_b32_e32 v2, v0
1911
- ; GFX908-NEXT: v_mov_b32_e32 v3, v1
1912
- ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1913
- ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1909
+ ; GFX908-NEXT: v_mov_b32_e32 v8, s20
1910
+ ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1911
+ ; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
1914
1912
; GFX908-NEXT: s_mov_b64 s[4:5], 0
1915
1913
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
1916
1914
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
1917
1915
; GFX908-NEXT: s_waitcnt vmcnt(0)
1918
- ; GFX908-NEXT: v_mov_b32_e32 v10, v1
1919
- ; GFX908-NEXT: v_mov_b32_e32 v9, v0
1920
- ; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1921
- ; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1922
- ; GFX908-NEXT: v_mov_b32_e32 v0, v7
1923
- ; GFX908-NEXT: v_mov_b32_e32 v1, v8
1924
- ; GFX908-NEXT: v_mov_b32_e32 v2, v9
1925
- ; GFX908-NEXT: v_mov_b32_e32 v3, v10
1926
- ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1916
+ ; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1917
+ ; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
1918
+ ; GFX908-NEXT: v_mov_b32_e32 v0, v2
1919
+ ; GFX908-NEXT: v_mov_b32_e32 v1, v3
1920
+ ; GFX908-NEXT: v_mov_b32_e32 v2, v4
1921
+ ; GFX908-NEXT: v_mov_b32_e32 v3, v5
1922
+ ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
1927
1923
; GFX908-NEXT: s_waitcnt vmcnt(0)
1928
1924
; GFX908-NEXT: buffer_wbinvl1
1929
- ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1925
+ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1926
+ ; GFX908-NEXT: v_mov_b32_e32 v5, v1
1930
1927
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1928
+ ; GFX908-NEXT: v_mov_b32_e32 v4, v0
1931
1929
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
1932
1930
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
1933
1931
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1937,28 +1935,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1937
1935
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1938
1936
; GFX8: ; %bb.0:
1939
1937
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940
- ; GFX8-NEXT: v_mov_b32_e32 v6, s20
1941
- ; GFX8-NEXT: v_mov_b32_e32 v2, v0
1942
- ; GFX8-NEXT: v_mov_b32_e32 v3, v1
1943
- ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1944
- ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1938
+ ; GFX8-NEXT: v_mov_b32_e32 v8, s20
1939
+ ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1940
+ ; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
1945
1941
; GFX8-NEXT: s_mov_b64 s[4:5], 0
1946
1942
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
1947
1943
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
1948
1944
; GFX8-NEXT: s_waitcnt vmcnt(0)
1949
- ; GFX8-NEXT: v_mov_b32_e32 v10, v1
1950
- ; GFX8-NEXT: v_mov_b32_e32 v9, v0
1951
- ; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1952
- ; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1953
- ; GFX8-NEXT: v_mov_b32_e32 v0, v7
1954
- ; GFX8-NEXT: v_mov_b32_e32 v1, v8
1955
- ; GFX8-NEXT: v_mov_b32_e32 v2, v9
1956
- ; GFX8-NEXT: v_mov_b32_e32 v3, v10
1957
- ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1945
+ ; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1946
+ ; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
1947
+ ; GFX8-NEXT: v_mov_b32_e32 v0, v2
1948
+ ; GFX8-NEXT: v_mov_b32_e32 v1, v3
1949
+ ; GFX8-NEXT: v_mov_b32_e32 v2, v4
1950
+ ; GFX8-NEXT: v_mov_b32_e32 v3, v5
1951
+ ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
1958
1952
; GFX8-NEXT: s_waitcnt vmcnt(0)
1959
1953
; GFX8-NEXT: buffer_wbinvl1
1960
- ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1954
+ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1955
+ ; GFX8-NEXT: v_mov_b32_e32 v5, v1
1961
1956
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1957
+ ; GFX8-NEXT: v_mov_b32_e32 v4, v0
1962
1958
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
1963
1959
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
1964
1960
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
0 commit comments