Skip to content

Commit 0f9b06a

Browse files
arsenm authored and mahesh-attarde committed
PeepholeOpt: Fix losing subregister indexes on full copies (llvm#161310)
Previously, if we had a subregister extract reading from a full copy, the no-subregister incoming copy would overwrite the DefSubReg index of the folding context. There's one ugly RVV regression, but it's a downstream issue of this change; an unnecessary same-class reg-to-reg full copy was avoided.
1 parent e1960a5 commit 0f9b06a

34 files changed

+2916
-3127
lines changed

llvm/lib/CodeGen/PeepholeOptimizer.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1929,7 +1929,27 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
19291929
const MachineOperand &Src = Def->getOperand(1);
19301930
if (Src.isUndef())
19311931
return ValueTrackerResult();
1932-
return ValueTrackerResult(Src.getReg(), Src.getSubReg());
1932+
1933+
Register SrcReg = Src.getReg();
1934+
unsigned SubReg = Src.getSubReg();
1935+
if (DefSubReg) {
1936+
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1937+
SubReg = TRI->composeSubRegIndices(SubReg, DefSubReg);
1938+
1939+
if (SrcReg.isVirtual()) {
1940+
// TODO: Try constraining on rewrite if we can
1941+
const TargetRegisterClass *RegRC = MRI.getRegClass(SrcReg);
1942+
const TargetRegisterClass *SrcWithSubRC =
1943+
TRI->getSubClassWithSubReg(RegRC, SubReg);
1944+
if (RegRC != SrcWithSubRC)
1945+
return ValueTrackerResult();
1946+
} else {
1947+
if (!TRI->getSubReg(SrcReg, SubReg))
1948+
return ValueTrackerResult();
1949+
}
1950+
}
1951+
1952+
return ValueTrackerResult(SrcReg, SubReg);
19331953
}
19341954

19351955
ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 50 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,26 +1812,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18121812
; GFX12-NEXT: s_wait_samplecnt 0x0
18131813
; GFX12-NEXT: s_wait_bvhcnt 0x0
18141814
; GFX12-NEXT: s_wait_kmcnt 0x0
1815-
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1816-
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1815+
; GFX12-NEXT: v_mov_b32_e32 v8, s16
1816+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
18171817
; GFX12-NEXT: s_mov_b32 s4, 0
1818-
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1819-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1818+
; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen
18201819
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
18211820
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18221821
; GFX12-NEXT: s_wait_loadcnt 0x0
1823-
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1822+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
18241823
; GFX12-NEXT: s_wait_storecnt 0x0
18251824
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1826-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1827-
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1828-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1829-
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1830-
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1831-
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1825+
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7]
1826+
; GFX12-NEXT: v_mov_b32_e32 v0, v2
1827+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1828+
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1829+
; GFX12-NEXT: v_mov_b32_e32 v3, v5
1830+
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN
18321831
; GFX12-NEXT: s_wait_loadcnt 0x0
18331832
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1834-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1833+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1834+
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
18351835
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
18361836
; GFX12-NEXT: s_wait_alu 0xfffe
18371837
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1854,27 +1854,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18541854
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
18551855
; GFX11: ; %bb.0:
18561856
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1857-
; GFX11-NEXT: v_mov_b32_e32 v6, s16
1858-
; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1857+
; GFX11-NEXT: v_mov_b32_e32 v8, s16
1858+
; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
18591859
; GFX11-NEXT: s_mov_b32 s4, 0
1860-
; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
1861-
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1860+
; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen
18621861
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
18631862
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
18641863
; GFX11-NEXT: s_waitcnt vmcnt(0)
1865-
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1864+
; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
18661865
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
18671866
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1868-
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1869-
; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1870-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1871-
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1872-
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1873-
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
1867+
; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
1868+
; GFX11-NEXT: v_mov_b32_e32 v0, v2
1869+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1870+
; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1871+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
1872+
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc
18741873
; GFX11-NEXT: s_waitcnt vmcnt(0)
18751874
; GFX11-NEXT: buffer_gl1_inv
18761875
; GFX11-NEXT: buffer_gl0_inv
1877-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1876+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1877+
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
18781878
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
18791879
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18801880
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1906,28 +1906,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
19061906
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
19071907
; GFX908: ; %bb.0:
19081908
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909-
; GFX908-NEXT: v_mov_b32_e32 v6, s20
1910-
; GFX908-NEXT: v_mov_b32_e32 v2, v0
1911-
; GFX908-NEXT: v_mov_b32_e32 v3, v1
1912-
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1913-
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1909+
; GFX908-NEXT: v_mov_b32_e32 v8, s20
1910+
; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1911+
; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
19141912
; GFX908-NEXT: s_mov_b64 s[4:5], 0
19151913
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
19161914
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
19171915
; GFX908-NEXT: s_waitcnt vmcnt(0)
1918-
; GFX908-NEXT: v_mov_b32_e32 v10, v1
1919-
; GFX908-NEXT: v_mov_b32_e32 v9, v0
1920-
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1921-
; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1922-
; GFX908-NEXT: v_mov_b32_e32 v0, v7
1923-
; GFX908-NEXT: v_mov_b32_e32 v1, v8
1924-
; GFX908-NEXT: v_mov_b32_e32 v2, v9
1925-
; GFX908-NEXT: v_mov_b32_e32 v3, v10
1926-
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1916+
; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1917+
; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
1918+
; GFX908-NEXT: v_mov_b32_e32 v0, v2
1919+
; GFX908-NEXT: v_mov_b32_e32 v1, v3
1920+
; GFX908-NEXT: v_mov_b32_e32 v2, v4
1921+
; GFX908-NEXT: v_mov_b32_e32 v3, v5
1922+
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
19271923
; GFX908-NEXT: s_waitcnt vmcnt(0)
19281924
; GFX908-NEXT: buffer_wbinvl1
1929-
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1925+
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1926+
; GFX908-NEXT: v_mov_b32_e32 v5, v1
19301927
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1928+
; GFX908-NEXT: v_mov_b32_e32 v4, v0
19311929
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
19321930
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
19331931
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1937,28 +1935,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
19371935
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
19381936
; GFX8: ; %bb.0:
19391937
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940-
; GFX8-NEXT: v_mov_b32_e32 v6, s20
1941-
; GFX8-NEXT: v_mov_b32_e32 v2, v0
1942-
; GFX8-NEXT: v_mov_b32_e32 v3, v1
1943-
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1944-
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1938+
; GFX8-NEXT: v_mov_b32_e32 v8, s20
1939+
; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1940+
; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
19451941
; GFX8-NEXT: s_mov_b64 s[4:5], 0
19461942
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
19471943
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
19481944
; GFX8-NEXT: s_waitcnt vmcnt(0)
1949-
; GFX8-NEXT: v_mov_b32_e32 v10, v1
1950-
; GFX8-NEXT: v_mov_b32_e32 v9, v0
1951-
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1952-
; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
1953-
; GFX8-NEXT: v_mov_b32_e32 v0, v7
1954-
; GFX8-NEXT: v_mov_b32_e32 v1, v8
1955-
; GFX8-NEXT: v_mov_b32_e32 v2, v9
1956-
; GFX8-NEXT: v_mov_b32_e32 v3, v10
1957-
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1945+
; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1946+
; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
1947+
; GFX8-NEXT: v_mov_b32_e32 v0, v2
1948+
; GFX8-NEXT: v_mov_b32_e32 v1, v3
1949+
; GFX8-NEXT: v_mov_b32_e32 v2, v4
1950+
; GFX8-NEXT: v_mov_b32_e32 v3, v5
1951+
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
19581952
; GFX8-NEXT: s_waitcnt vmcnt(0)
19591953
; GFX8-NEXT: buffer_wbinvl1
1960-
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1954+
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1955+
; GFX8-NEXT: v_mov_b32_e32 v5, v1
19611956
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1957+
; GFX8-NEXT: v_mov_b32_e32 v4, v0
19621958
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
19631959
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
19641960
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 50 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,26 +1812,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
18121812
; GFX12-NEXT: s_wait_samplecnt 0x0
18131813
; GFX12-NEXT: s_wait_bvhcnt 0x0
18141814
; GFX12-NEXT: s_wait_kmcnt 0x0
1815-
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1816-
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1815+
; GFX12-NEXT: v_mov_b32_e32 v8, s16
1816+
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
18171817
; GFX12-NEXT: s_mov_b32 s4, 0
1818-
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1819-
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1818+
; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen
18201819
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
18211820
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
18221821
; GFX12-NEXT: s_wait_loadcnt 0x0
1823-
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1822+
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
18241823
; GFX12-NEXT: s_wait_storecnt 0x0
18251824
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1826-
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1827-
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
1828-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1829-
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1830-
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1831-
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1825+
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7]
1826+
; GFX12-NEXT: v_mov_b32_e32 v0, v2
1827+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1828+
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1829+
; GFX12-NEXT: v_mov_b32_e32 v3, v5
1830+
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN
18321831
; GFX12-NEXT: s_wait_loadcnt 0x0
18331832
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1834-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1833+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1834+
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
18351835
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
18361836
; GFX12-NEXT: s_wait_alu 0xfffe
18371837
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1854,27 +1854,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
18541854
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
18551855
; GFX11: ; %bb.0:
18561856
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1857-
; GFX11-NEXT: v_mov_b32_e32 v6, s16
1858-
; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1857+
; GFX11-NEXT: v_mov_b32_e32 v8, s16
1858+
; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
18591859
; GFX11-NEXT: s_mov_b32 s4, 0
1860-
; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
1861-
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1860+
; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen
18621861
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
18631862
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
18641863
; GFX11-NEXT: s_waitcnt vmcnt(0)
1865-
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1864+
; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
18661865
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
18671866
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1868-
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1869-
; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
1870-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1871-
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1872-
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1873-
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
1867+
; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
1868+
; GFX11-NEXT: v_mov_b32_e32 v0, v2
1869+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1870+
; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
1871+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
1872+
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc
18741873
; GFX11-NEXT: s_waitcnt vmcnt(0)
18751874
; GFX11-NEXT: buffer_gl1_inv
18761875
; GFX11-NEXT: buffer_gl0_inv
1877-
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1876+
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
1877+
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
18781878
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
18791879
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18801880
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1906,28 +1906,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
19061906
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
19071907
; GFX908: ; %bb.0:
19081908
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909-
; GFX908-NEXT: v_mov_b32_e32 v6, s20
1910-
; GFX908-NEXT: v_mov_b32_e32 v2, v0
1911-
; GFX908-NEXT: v_mov_b32_e32 v3, v1
1912-
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1913-
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1909+
; GFX908-NEXT: v_mov_b32_e32 v8, s20
1910+
; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1911+
; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
19141912
; GFX908-NEXT: s_mov_b64 s[4:5], 0
19151913
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
19161914
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
19171915
; GFX908-NEXT: s_waitcnt vmcnt(0)
1918-
; GFX908-NEXT: v_mov_b32_e32 v10, v1
1919-
; GFX908-NEXT: v_mov_b32_e32 v9, v0
1920-
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1921-
; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
1922-
; GFX908-NEXT: v_mov_b32_e32 v0, v7
1923-
; GFX908-NEXT: v_mov_b32_e32 v1, v8
1924-
; GFX908-NEXT: v_mov_b32_e32 v2, v9
1925-
; GFX908-NEXT: v_mov_b32_e32 v3, v10
1926-
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1916+
; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1917+
; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
1918+
; GFX908-NEXT: v_mov_b32_e32 v0, v2
1919+
; GFX908-NEXT: v_mov_b32_e32 v1, v3
1920+
; GFX908-NEXT: v_mov_b32_e32 v2, v4
1921+
; GFX908-NEXT: v_mov_b32_e32 v3, v5
1922+
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
19271923
; GFX908-NEXT: s_waitcnt vmcnt(0)
19281924
; GFX908-NEXT: buffer_wbinvl1
1929-
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1925+
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1926+
; GFX908-NEXT: v_mov_b32_e32 v5, v1
19301927
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1928+
; GFX908-NEXT: v_mov_b32_e32 v4, v0
19311929
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
19321930
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
19331931
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1937,28 +1935,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
19371935
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
19381936
; GFX8: ; %bb.0:
19391937
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940-
; GFX8-NEXT: v_mov_b32_e32 v6, s20
1941-
; GFX8-NEXT: v_mov_b32_e32 v2, v0
1942-
; GFX8-NEXT: v_mov_b32_e32 v3, v1
1943-
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1944-
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
1938+
; GFX8-NEXT: v_mov_b32_e32 v8, s20
1939+
; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
1940+
; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
19451941
; GFX8-NEXT: s_mov_b64 s[4:5], 0
19461942
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
19471943
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
19481944
; GFX8-NEXT: s_waitcnt vmcnt(0)
1949-
; GFX8-NEXT: v_mov_b32_e32 v10, v1
1950-
; GFX8-NEXT: v_mov_b32_e32 v9, v0
1951-
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
1952-
; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
1953-
; GFX8-NEXT: v_mov_b32_e32 v0, v7
1954-
; GFX8-NEXT: v_mov_b32_e32 v1, v8
1955-
; GFX8-NEXT: v_mov_b32_e32 v2, v9
1956-
; GFX8-NEXT: v_mov_b32_e32 v3, v10
1957-
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1945+
; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
1946+
; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
1947+
; GFX8-NEXT: v_mov_b32_e32 v0, v2
1948+
; GFX8-NEXT: v_mov_b32_e32 v1, v3
1949+
; GFX8-NEXT: v_mov_b32_e32 v2, v4
1950+
; GFX8-NEXT: v_mov_b32_e32 v3, v5
1951+
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc
19581952
; GFX8-NEXT: s_waitcnt vmcnt(0)
19591953
; GFX8-NEXT: buffer_wbinvl1
1960-
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1954+
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
1955+
; GFX8-NEXT: v_mov_b32_e32 v5, v1
19611956
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1957+
; GFX8-NEXT: v_mov_b32_e32 v4, v0
19621958
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
19631959
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
19641960
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end

0 commit comments

Comments
 (0)