@@ -1625,6 +1625,189 @@ entry:
   ret void
 }
 
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_PACK_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in1) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_u32_e32 v7, s0, v8
+; GCN-NEXT: s_movk_i32 s0, 0xff88
+; GCN-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; GCN-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; GCN-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v7
+; GCN-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; GCN-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; GCN-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; GCN-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: v_add_u32_e32 v0, s1, v8
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; EXACTCUTOFF-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v7, s0, v8
+; EXACTCUTOFF-NEXT: s_movk_i32 s0, 0xff88
+; EXACTCUTOFF-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v7
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; EXACTCUTOFF-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v8
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+  %el.0.addr = getelementptr <2 x float>, ptr addrspace(3) %in, i32 %idx
+  %el.0 = load <2 x float>, ptr addrspace(3) %el.0.addr
+  %el.1.addr = getelementptr <2 x float>, ptr addrspace(3) %el.0.addr, i32 64
+  %el.1 = load <2 x float>, ptr addrspace(3) %el.1.addr
+  %el.2.addr = getelementptr <2 x float>, ptr addrspace(3) %el.1.addr, i32 128
+  %el.2 = load <2 x float>, ptr addrspace(3) %el.2.addr
+  %el.3.addr = getelementptr <2 x float>, ptr addrspace(3) %el.2.addr, i32 192
+  %el.3 = load <2 x float>, ptr addrspace(3) %el.3.addr
+  %el.4.addr = getelementptr <2 x float>, ptr addrspace(3) %el.3.addr, i32 256
+  %el.4 = load <2 x float>, ptr addrspace(3) %el.4.addr
+  %el.5.addr = getelementptr <2 x float>, ptr addrspace(3) %el.4.addr, i32 %idx
+  %el.5 = load <2 x float>, ptr addrspace(3) %el.5.addr
+  %el.6.addr = getelementptr <2 x float>, ptr addrspace(3) %el.5.addr, i32 64
+  %el.6 = load <2 x float>, ptr addrspace(3) %el.6.addr
+  %el.7.addr = getelementptr <2 x float>, ptr addrspace(3) %el.6.addr, i32 128
+  %el.7 = load <2 x float>, ptr addrspace(3) %el.7.addr
+  %v0 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.0, <2 x float> %el.4, <2 x float> <float 0.0, float 0.0>)
+  %v1 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.1, <2 x float> %el.5, <2 x float> %v0)
+  %v2 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.2, <2 x float> %el.6, <2 x float> %v1)
+  %v3 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.3, <2 x float> %el.7, <2 x float> %v2)
+  %op0 = extractelement <2 x float> %v0, i32 0
+  %op1 = extractelement <2 x float> %v0, i32 1
+  %op2 = extractelement <2 x float> %v1, i32 0
+  %op3 = extractelement <2 x float> %v1, i32 1
+  %op4 = extractelement <2 x float> %v2, i32 0
+  %op5 = extractelement <2 x float> %v2, i32 1
+  %op6 = extractelement <2 x float> %v3, i32 0
+  %op7 = extractelement <2 x float> %v3, i32 1
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op0, float %op1, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op2, float %op3, <32 x float> %mai.0, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op4, float %op5, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op6, float %op7, <32 x float> %mai.2, i32 0, i32 0, i32 0)
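+  ; The three operands of llvm.amdgcn.sched.group.barrier are (mask, size, SyncID):
+  ; the mask selects the instruction class for a scheduling group (0x800 = PACK,
+  ; 0x8 = MFMA, per the PACK/MFMA comments below), size is the number of
+  ; instructions requested for that group, and barriers sharing a SyncID are
+  ; ordered against each other, so the alternating size-1 PACK/MFMA groups ask
+  ; the scheduler to interleave one packed FMA with each MFMA above.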
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  %store.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+  store <32 x float> %mai.3, ptr addrspace(3) %store.addr
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1