@@ -1425,41 +1425,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
14251425; GFX90A: ; %bb.0: ; %entry
14261426; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
14271427; GFX90A-NEXT: s_mov_b32 s0, 16
1428- ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
14291428; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
14301429; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1431- ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1
1432- ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1
1433- ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1
1434- ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1
1435- ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1
1436- ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1
1437- ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1
1438- ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1
1439- ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1
1440- ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1
1441- ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1
1442- ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1
1443- ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1
1444- ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1
1445- ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
1446- ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1
1447- ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1
1448- ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1
1449- ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1
1450- ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1
1451- ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1
1452- ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1
1453- ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1
1454- ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1
1455- ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1
1456- ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1
1457- ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1
1458- ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1
1459- ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1
1460- ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1
1461- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1462- ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
1430+ ; GFX90A-NEXT: v_mov_b32_e32 v0, s1
1431+ ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
1432+ ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
1433+ ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
1434+ ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
1435+ ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
1436+ ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
1437+ ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
1438+ ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
1439+ ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
1440+ ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
1441+ ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
1442+ ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
1443+ ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
1444+ ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
1445+ ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
1446+ ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
1447+ ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
1448+ ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
1449+ ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
1450+ ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
1451+ ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
1452+ ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
1453+ ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
1454+ ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
1455+ ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
1456+ ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
1457+ ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
1458+ ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
1459+ ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
1460+ ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
1461+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
1462+ ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1463+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
14631464; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
14641465; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
14651466; GFX90A-NEXT: s_nop 1
@@ -1487,41 +1488,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
14871488; GFX942: ; %bb.0: ; %entry
14881489; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
14891490; GFX942-NEXT: s_mov_b32 s0, 16
1490- ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
14911491; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
14921492; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1493- ; GFX942-NEXT: v_accvgpr_write_b32 a31, s1
1494- ; GFX942-NEXT: v_accvgpr_write_b32 a30, s1
1495- ; GFX942-NEXT: v_accvgpr_write_b32 a29, s1
1496- ; GFX942-NEXT: v_accvgpr_write_b32 a28, s1
1497- ; GFX942-NEXT: v_accvgpr_write_b32 a27, s1
1498- ; GFX942-NEXT: v_accvgpr_write_b32 a26, s1
1499- ; GFX942-NEXT: v_accvgpr_write_b32 a25, s1
1500- ; GFX942-NEXT: v_accvgpr_write_b32 a24, s1
1501- ; GFX942-NEXT: v_accvgpr_write_b32 a23, s1
1502- ; GFX942-NEXT: v_accvgpr_write_b32 a22, s1
1503- ; GFX942-NEXT: v_accvgpr_write_b32 a21, s1
1504- ; GFX942-NEXT: v_accvgpr_write_b32 a20, s1
1505- ; GFX942-NEXT: v_accvgpr_write_b32 a19, s1
1506- ; GFX942-NEXT: v_accvgpr_write_b32 a18, s1
1507- ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
1508- ; GFX942-NEXT: v_accvgpr_write_b32 a16, s1
1509- ; GFX942-NEXT: v_accvgpr_write_b32 a15, s1
1510- ; GFX942-NEXT: v_accvgpr_write_b32 a14, s1
1511- ; GFX942-NEXT: v_accvgpr_write_b32 a13, s1
1512- ; GFX942-NEXT: v_accvgpr_write_b32 a12, s1
1513- ; GFX942-NEXT: v_accvgpr_write_b32 a11, s1
1514- ; GFX942-NEXT: v_accvgpr_write_b32 a10, s1
1515- ; GFX942-NEXT: v_accvgpr_write_b32 a9, s1
1516- ; GFX942-NEXT: v_accvgpr_write_b32 a8, s1
1517- ; GFX942-NEXT: v_accvgpr_write_b32 a7, s1
1518- ; GFX942-NEXT: v_accvgpr_write_b32 a6, s1
1519- ; GFX942-NEXT: v_accvgpr_write_b32 a5, s1
1520- ; GFX942-NEXT: v_accvgpr_write_b32 a4, s1
1521- ; GFX942-NEXT: v_accvgpr_write_b32 a3, s1
1522- ; GFX942-NEXT: v_accvgpr_write_b32 a2, s1
1523- ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1524- ; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
1493+ ; GFX942-NEXT: v_mov_b32_e32 v0, s1
1494+ ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
1495+ ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
1496+ ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
1497+ ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
1498+ ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
1499+ ; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
1500+ ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
1501+ ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
1502+ ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
1503+ ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
1504+ ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
1505+ ; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
1506+ ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
1507+ ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
1508+ ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
1509+ ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
1510+ ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
1511+ ; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
1512+ ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
1513+ ; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
1514+ ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
1515+ ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
1516+ ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
1517+ ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
1518+ ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
1519+ ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
1520+ ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
1521+ ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
1522+ ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
1523+ ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
1524+ ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
1525+ ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
1526+ ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
15251527; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
15261528; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
15271529; GFX942-NEXT: s_nop 1
@@ -1696,6 +1698,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
16961698; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
16971699; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
16981700; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
1701+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1702+ ; GFX90A-NEXT: v_mov_b32_e32 v0, s1
16991703; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
17001704; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
17011705; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1725,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17251729; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
17261730; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
17271731; GFX90A-NEXT: s_mov_b32 s0, 16
1728- ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1729- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1732+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
17301733; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
17311734; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
17321735; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -1759,6 +1762,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17591762; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
17601763; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
17611764; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
1765+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1766+ ; GFX942-NEXT: v_mov_b32_e32 v0, s1
17621767; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
17631768; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
17641769; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1788,8 +1793,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17881793; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
17891794; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
17901795; GFX942-NEXT: s_mov_b32 s0, 16
1791- ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1792- ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1796+ ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
17931797; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
17941798; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
17951799; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -2050,66 +2054,38 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
20502054; GFX908-NEXT: s_nop 7
20512055; GFX908-NEXT: s_nop 1
20522056; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2053- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2054- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2057+ ; GFX908-NEXT: s_nop 1
2058+ ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
20552059; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
2056- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2057- ; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
2058- ; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
2060+ ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
2061+ ; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
20592062; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
2060- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2061- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2062- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2063- ; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
2064- ; GFX908-NEXT: v_accvgpr_write_b32 a6, v33
2063+ ; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
2064+ ; GFX908-NEXT: v_accvgpr_write_b32 a6, v2
20652065; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
2066- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2067- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2068- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2069- ; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
2070- ; GFX908-NEXT: v_accvgpr_write_b32 a9, v33
2066+ ; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
2067+ ; GFX908-NEXT: v_accvgpr_write_b32 a9, v2
20712068; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
2072- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2073- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2074- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2075- ; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
2076- ; GFX908-NEXT: v_accvgpr_write_b32 a12, v33
2069+ ; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
2070+ ; GFX908-NEXT: v_accvgpr_write_b32 a12, v2
20772071; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
2078- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2079- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2080- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2081- ; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
2082- ; GFX908-NEXT: v_accvgpr_write_b32 a15, v33
2072+ ; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
2073+ ; GFX908-NEXT: v_accvgpr_write_b32 a15, v2
20832074; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
2084- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2085- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2086- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2087- ; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
2088- ; GFX908-NEXT: v_accvgpr_write_b32 a18, v33
2075+ ; GFX908-NEXT: v_accvgpr_write_b32 a17, v2
2076+ ; GFX908-NEXT: v_accvgpr_write_b32 a18, v2
20892077; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
2090- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2091- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2092- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2093- ; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
2094- ; GFX908-NEXT: v_accvgpr_write_b32 a21, v33
2078+ ; GFX908-NEXT: v_accvgpr_write_b32 a20, v2
2079+ ; GFX908-NEXT: v_accvgpr_write_b32 a21, v2
20952080; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
2096- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2097- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2098- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2099- ; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
2100- ; GFX908-NEXT: v_accvgpr_write_b32 a24, v33
2081+ ; GFX908-NEXT: v_accvgpr_write_b32 a23, v2
2082+ ; GFX908-NEXT: v_accvgpr_write_b32 a24, v2
21012083; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
2102- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2103- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2104- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2105- ; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
2106- ; GFX908-NEXT: v_accvgpr_write_b32 a27, v33
2084+ ; GFX908-NEXT: v_accvgpr_write_b32 a26, v2
2085+ ; GFX908-NEXT: v_accvgpr_write_b32 a27, v2
21072086; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
2108- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2109- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2110- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2111- ; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
2112- ; GFX908-NEXT: v_accvgpr_write_b32 a30, v33
2087+ ; GFX908-NEXT: v_accvgpr_write_b32 a29, v2
2088+ ; GFX908-NEXT: v_accvgpr_write_b32 a30, v2
21132089; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
21142090; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader
21152091; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
0 commit comments