@@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15241524; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15251525; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
15261526; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1527- ; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1527+ ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000 , v1
15281528; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1529- ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
15301529; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
15311530; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
15321531; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15661565; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15671566; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
15681567; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1569- ; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1568+ ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000 , v1
15701569; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1571- ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
15721570; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
15731571; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
15741572; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16081606; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16091607; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
16101608; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1611- ; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1609+ ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000 , v1
16121610; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1613- ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
16141611; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16151612; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
16161613; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16321629; GFX10: ; %bb.0:
16331630; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
16341631; GFX10-NEXT: v_mov_b32_e32 v0, 0
1635- ; GFX10-NEXT: s_mov_b32 s5, 0xff800000
16361632; GFX10-NEXT: s_waitcnt lgkmcnt(0)
16371633; GFX10-NEXT: s_and_b32 s0, s2, -4
16381634; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16501646; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16511647; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
16521648; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1653- ; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1649+ ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
16541650; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16551651; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
16561652; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16731669; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
16741670; GFX11: ; %bb.0:
16751671; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1676- ; GFX11-NEXT: s_mov_b32 s5, 0xff800000
16771672; GFX11-NEXT: v_mov_b32_e32 v0, 0
16781673; GFX11-NEXT: s_waitcnt lgkmcnt(0)
16791674; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16941689; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
16951690; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
16961691; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1697- ; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1692+ ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
16981693; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16991694; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17001695; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17441739; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17451740; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
17461741; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1747- ; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1742+ ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000 , v1
17481743; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1749- ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
17501744; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17511745; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17521746; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17861780; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17871781; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
17881782; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1789- ; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1783+ ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000 , v1
17901784; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1791- ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
17921785; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17931786; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17941787; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18281821; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18291822; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
18301823; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1831- ; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000 , v1
1824+ ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000 , v1
18321825; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1833- ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
18341826; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18351827; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
18361828; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18541846; GFX10: ; %bb.0:
18551847; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
18561848; GFX10-NEXT: v_mov_b32_e32 v0, 0
1857- ; GFX10-NEXT: s_mov_b32 s5, 0xff800000
18581849; GFX10-NEXT: s_waitcnt lgkmcnt(0)
18591850; GFX10-NEXT: s_and_b32 s0, s2, -4
18601851; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18721863; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18731864; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
18741865; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1875- ; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1866+ ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
18761867; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18771868; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
18781869; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18951886; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
18961887; GFX11: ; %bb.0:
18971888; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1898- ; GFX11-NEXT: s_mov_b32 s5, 0xff800000
18991889; GFX11-NEXT: v_mov_b32_e32 v0, 0
19001890; GFX11-NEXT: s_waitcnt lgkmcnt(0)
19011891; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
19161906; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
19171907; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
19181908; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1919- ; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1909+ ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
19201910; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19211911; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
19221912; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
0 commit comments