@@ -1717,11 +1717,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
17171717; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
17181718; GFX7-NEXT: s_waitcnt vmcnt(0)
17191719; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1720- ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
17211720; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17221721; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
1722+ ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
1723+ ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v1
17231724; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
1724- ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
17251725; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
17261726; GFX7-NEXT: s_endpgm
17271727;
@@ -1748,8 +1748,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
17481748; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17491749; GFX8-NEXT: s_waitcnt lgkmcnt(0)
17501750; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
1751- ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1 , v0
1752- ; GFX8-NEXT: v_add_u32_e32 v2, vcc , v1, v0
1751+ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0 , v0
1752+ ; GFX8-NEXT: v_mad_u32_u24 v2, v2 , v1, v0
17531753; GFX8-NEXT: v_mov_b32_e32 v0, s4
17541754; GFX8-NEXT: v_mov_b32_e32 v1, s5
17551755; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1765,13 +1765,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
17651765; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
17661766; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
17671767; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1768+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1769+ ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
17681770; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1769- ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1771+ ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
17701772; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17711773; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
17721774; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
17731775; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
1774- ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
1776+ ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
1777+ ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
17751778; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
17761779; GFX9-NODL-NEXT: s_endpgm
17771780;
@@ -1785,13 +1788,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
17851788; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
17861789; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
17871790; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1791+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1792+ ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1
17881793; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1789- ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1794+ ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2
17901795; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17911796; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
17921797; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
17931798; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
1794- ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
1799+ ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
1800+ ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
17951801; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
17961802; GFX9-DL-NEXT: s_endpgm
17971803;
@@ -1811,12 +1817,14 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
18111817; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
18121818; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
18131819; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
1814- ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2 , v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1815- ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1820+ ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff , v1
1821+ ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
18161822; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
18171823; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
1818- ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
1819- ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1824+ ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
1825+ ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
1826+ ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v2, v1, v0
1827+ ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
18201828; GFX10-DL-NEXT: s_endpgm
18211829 ptr addrspace (1 ) %src2 ,
18221830 ptr addrspace (1 ) nocapture %dst ) {
@@ -1873,8 +1881,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
18731881; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
18741882; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18751883; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
1876- ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1 , v0
1877- ; GFX7-NEXT: v_add_i32_e32 v0, vcc , v1, v0
1884+ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0 , v0
1885+ ; GFX7-NEXT: v_mad_i32_i24 v0, v3 , v1, v0
18781886; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
18791887; GFX7-NEXT: s_endpgm
18801888;
@@ -1901,8 +1909,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
19011909; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
19021910; GFX8-NEXT: s_waitcnt lgkmcnt(0)
19031911; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
1904- ; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1 , v0
1905- ; GFX8-NEXT: v_add_u32_e32 v2, vcc , v1, v0
1912+ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0 , v0
1913+ ; GFX8-NEXT: v_mad_i32_i24 v2, v2 , v1, v0
19061914; GFX8-NEXT: v_mov_b32_e32 v0, s4
19071915; GFX8-NEXT: v_mov_b32_e32 v1, s5
19081916; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1918,13 +1926,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
19181926; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
19191927; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
19201928; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1929+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1930+ ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16
19211931; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1922- ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1932+ ; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16
19231933; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
19241934; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
19251935; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
19261936; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
1927- ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
1937+ ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
1938+ ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
19281939; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
19291940; GFX9-NODL-NEXT: s_endpgm
19301941;
@@ -1938,13 +1949,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
19381949; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
19391950; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
19401951; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1952+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1953+ ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
19411954; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1942- ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1955+ ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
19431956; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
19441957; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
19451958; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
19461959; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
1947- ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
1960+ ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
1961+ ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
19481962; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
19491963; GFX9-DL-NEXT: s_endpgm
19501964;
@@ -1964,12 +1978,14 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
19641978; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
19651979; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
19661980; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
1967- ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1968- ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1981+ ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
1982+ ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 16
19691983; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
19701984; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
1971- ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
1972- ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1985+ ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
1986+ ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
1987+ ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v2, v1, v0
1988+ ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
19731989; GFX10-DL-NEXT: s_endpgm
19741990 ptr addrspace (1 ) %src2 ,
19751991 ptr addrspace (1 ) nocapture %dst ) {
@@ -2341,10 +2357,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
23412357; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
23422358; GFX7-NEXT: s_waitcnt vmcnt(0)
23432359; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2344- ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2345- ; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
2346- ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2360+ ; GFX7-NEXT: v_mul_u32_u24_e32 v4, v3, v1
23472361; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
2362+ ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2363+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2364+ ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1
23482365; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
23492366; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
23502367; GFX7-NEXT: s_endpgm
@@ -2370,9 +2387,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
23702387; GFX8-NEXT: s_waitcnt vmcnt(0)
23712388; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
23722389; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2373- ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2374- ; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
2390+ ; GFX8-NEXT: v_mul_u32_u24_e32 v4, v0, v3
23752391; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
2392+ ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2393+ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
23762394; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
23772395; GFX8-NEXT: v_mov_b32_e32 v0, s4
23782396; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2394,9 +2412,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
23942412; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
23952413; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
23962414; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
2415+ ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
23972416; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2398- ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
2399- ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
2417+ ; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
24002418; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
24012419; GFX9-NODL-NEXT: s_endpgm
24022420;
@@ -2415,9 +2433,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
24152433; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
24162434; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
24172435; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
2436+ ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
24182437; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2419- ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
2420- ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
2438+ ; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
24212439; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
24222440; GFX9-DL-NEXT: s_endpgm
24232441;
@@ -2438,12 +2456,12 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
24382456; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
24392457; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
24402458; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2441- ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
2459+ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
2460+ ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v3, v0
2461+ ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, v4
24422462; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2443- ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
2444- ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2445- ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
2446- ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2463+ ; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
2464+ ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
24472465; GFX10-DL-NEXT: s_endpgm
24482466 ptr addrspace (1 ) %src2 ,
24492467 ptr addrspace (1 ) nocapture %dst ) {
@@ -2499,9 +2517,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
24992517; GFX7-NEXT: s_waitcnt vmcnt(0)
25002518; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
25012519; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2502- ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2503- ; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
2520+ ; GFX7-NEXT: v_mul_i32_i24_e32 v4, v0, v2
25042521; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
2522+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2523+ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
25052524; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
25062525; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
25072526; GFX7-NEXT: s_endpgm
@@ -2527,9 +2546,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
25272546; GFX8-NEXT: s_waitcnt vmcnt(0)
25282547; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
25292548; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2530- ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2531- ; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
2549+ ; GFX8-NEXT: v_mul_i32_i24_e32 v4, v0, v3
25322550; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
2551+ ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2552+ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
25332553; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
25342554; GFX8-NEXT: v_mov_b32_e32 v0, s4
25352555; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2551,9 +2571,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
25512571; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
25522572; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
25532573; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
2574+ ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
25542575; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2555- ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
2556- ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
2576+ ; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
25572577; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
25582578; GFX9-NODL-NEXT: s_endpgm
25592579;
@@ -2572,9 +2592,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
25722592; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
25732593; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
25742594; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
2595+ ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
25752596; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2576- ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
2577- ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
2597+ ; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
25782598; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
25792599; GFX9-DL-NEXT: s_endpgm
25802600;
@@ -2595,12 +2615,12 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
25952615; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
25962616; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
25972617; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2598- ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
2618+ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
2619+ ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v4, v3, v0
2620+ ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, v4
25992621; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2600- ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
2601- ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2602- ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
2603- ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2622+ ; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
2623+ ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
26042624; GFX10-DL-NEXT: s_endpgm
26052625 ptr addrspace (1 ) %src2 ,
26062626 ptr addrspace (1 ) nocapture %dst ) {
0 commit comments