
Commit ff952a1

[AMDGPU] Add SchedGroupBarrier::PACK for packed math
Change-Id: I3f87324db3c085b46b2c37c8eb1a5b30f29f6858
1 parent 77ac5a2 commit ff952a1

4 files changed: +229 -27 lines

llvm/docs/AMDGPUUsage.rst
Lines changed: 1 addition & 0 deletions

@@ -1347,6 +1347,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   - 0x0100: All DS read instructions may be scheduled across sched_barrier.
   - 0x0200: All DS write instructions may be scheduled across sched_barrier.
   - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
+  - 0x0800: All Packed Arithmetic (e.g. V_PK_MOV, V_DOT, etc.) instructions may be scheduled across sched_barrier.

 llvm.amdgcn.sched.group.barrier  Creates schedule groups with specific properties to create custom scheduling
                                  pipelines. The ordering between groups is enforced by the instruction scheduler.
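From source code, the new mask bit would typically be reached through clang's scheduling builtins rather than hand-written IR. A minimal device-code sketch (the function and surrounding kernel context are illustrative assumptions, not part of this commit; each builtin call lowers to a llvm.amdgcn.sched.group.barrier intrinsic):

// Hypothetical HIP-style device code: ask the scheduler to alternate one
// packed-math instruction (PACK, mask 0x0800 added by this commit) with one
// MFMA (mask 0x0008), mirroring the interleave test added below.
__device__ void interleave_pack_mfma() {
  __builtin_amdgcn_sched_group_barrier(0x0800, 1, 0); // 1 PACK, sync group 0
  __builtin_amdgcn_sched_group_barrier(0x0008, 1, 0); // 1 MFMA, sync group 0
  __builtin_amdgcn_sched_group_barrier(0x0800, 1, 0); // 1 PACK
  __builtin_amdgcn_sched_group_barrier(0x0008, 1, 0); // 1 MFMA
}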

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Lines changed: 14 additions & 6 deletions

@@ -75,8 +75,9 @@ enum class SchedGroupMask {
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
   TRANS = 1u << 10,
+  PACK = 1u << 11,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE | TRANS,
+        DS_READ | DS_WRITE | TRANS | PACK,
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };

@@ -2414,7 +2415,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = true;

   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
-           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
+           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) &&
+           !TII->isVOP3P(MI))
     Result = true;

   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&

@@ -2455,6 +2457,10 @@
            TII->isTRANS(MI))
     Result = true;

+  else if (((SGMask & SchedGroupMask::PACK) != SchedGroupMask::NONE) &&
+           TII->isVOP3P(MI) && !TII->isMFMAorWMMA(MI))
+    Result = true;
+
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);
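Note how the two canAddMI changes keep the VALU and PACK groups disjoint: packed (VOP3P) instructions now leave the VALU group and are claimed by PACK instead. A standalone C++ sketch of the rule (InstrProps and the helper names are illustrative stand-ins for the TII->is*(MI) queries, not the backend's interface):

#include <cassert>

// Illustrative stand-ins for the instruction properties queried above.
struct InstrProps {
  bool IsVALU, IsMFMAorWMMA, IsTRANS, IsVOP3P;
};

// PACK claims packed (VOP3P) instructions that are not matrix ops...
bool classifiesAsPACK(const InstrProps &P) {
  return P.IsVOP3P && !P.IsMFMAorWMMA;
}

// ...while VALU now explicitly excludes VOP3P, so no instruction matches both.
bool classifiesAsVALU(const InstrProps &P) {
  return P.IsVALU && !P.IsMFMAorWMMA && !P.IsTRANS && !P.IsVOP3P;
}

int main() {
  // e.g. v_pk_fma_f32: a vector ALU op encoded as VOP3P.
  InstrProps PkFma{/*IsVALU=*/true, /*IsMFMAorWMMA=*/false,
                   /*IsTRANS=*/false, /*IsVOP3P=*/true};
  assert(classifiesAsPACK(PkFma) && !classifiesAsVALU(PkFma));
  return 0;
}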
@@ -2634,15 +2640,17 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;

-  // ALU implies VALU, SALU, MFMA, TRANS.
+  // ALU implies VALU, SALU, MFMA, TRANS, PACK.
   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
-                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
-  // VALU, SALU, MFMA, TRANS implies ALU.
+                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS &
+                    ~SchedGroupMask::PACK;
+  // VALU, SALU, MFMA, TRANS, PACK implies ALU.
   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
-           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
+           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE ||
+           (InvertedMask & SchedGroupMask::PACK) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;

   // VMEM implies VMEM_READ, VMEM_WRITE.
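The inverted-mask test updates below follow directly from this arithmetic. A self-contained sketch for the new bit (PACK = 1u << 11 comes from the enum hunk above; ALU = 1u << 0 and main()/printf are assumptions for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t ALU = 1u << 0, PACK = 1u << 11;
  const uint32_t ALL = (1u << 12) - 1; // twelve mask bits now that PACK exists

  uint32_t Mask = PACK;            // sched_barrier(2048): PACK may cross
  uint32_t Inverted = ~Mask & ALL; // everything except PACK: 0x7FF = 2047
  if ((Inverted & PACK) == 0)      // PACK cleared => ALU must be cleared too
    Inverted &= ~ALU;
  printf("After Inverting, SchedGroup Mask: %u\n", Inverted); // prints 2046
  return 0;
}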

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
Lines changed: 183 additions & 0 deletions

@@ -1625,6 +1625,189 @@ entry:
   ret void
 }

+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_PACK_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in1) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_u32_e32 v7, s0, v8
+; GCN-NEXT: s_movk_i32 s0, 0xff88
+; GCN-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; GCN-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; GCN-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v7
+; GCN-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; GCN-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; GCN-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; GCN-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: v_add_u32_e32 v0, s1, v8
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; EXACTCUTOFF-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v7, s0, v8
+; EXACTCUTOFF-NEXT: s_movk_i32 s0, 0xff88
+; EXACTCUTOFF-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v7
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; EXACTCUTOFF-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v8
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+  %el.0.addr = getelementptr <2 x float>, ptr addrspace(3) %in, i32 %idx
+  %el.0 = load <2 x float>, ptr addrspace(3) %el.0.addr
+  %el.1.addr = getelementptr <2 x float>, ptr addrspace(3) %el.0.addr, i32 64
+  %el.1 = load <2 x float>, ptr addrspace(3) %el.1.addr
+  %el.2.addr = getelementptr <2 x float>, ptr addrspace(3) %el.1.addr, i32 128
+  %el.2 = load <2 x float>, ptr addrspace(3) %el.2.addr
+  %el.3.addr = getelementptr <2 x float>, ptr addrspace(3) %el.2.addr, i32 192
+  %el.3 = load <2 x float>, ptr addrspace(3) %el.3.addr
+  %el.4.addr = getelementptr <2 x float>, ptr addrspace(3) %el.3.addr, i32 256
+  %el.4 = load <2 x float>, ptr addrspace(3) %el.4.addr
+  %el.5.addr = getelementptr <2 x float>, ptr addrspace(3) %el.4.addr, i32 %idx
+  %el.5 = load <2 x float>, ptr addrspace(3) %el.5.addr
+  %el.6.addr = getelementptr <2 x float>, ptr addrspace(3) %el.5.addr, i32 64
+  %el.6 = load <2 x float>, ptr addrspace(3) %el.6.addr
+  %el.7.addr = getelementptr <2 x float>, ptr addrspace(3) %el.6.addr, i32 128
+  %el.7 = load <2 x float>, ptr addrspace(3) %el.7.addr
+  %v0 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.0, <2 x float> %el.4, <2 x float> <float 0.0, float 0.0>)
+  %v1 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.1, <2 x float> %el.5, <2 x float> %v0)
+  %v2 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.2, <2 x float> %el.6, <2 x float> %v1)
+  %v3 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.3, <2 x float> %el.7, <2 x float> %v2)
+  %op0 = extractelement <2 x float> %v0, i32 0
+  %op1 = extractelement <2 x float> %v0, i32 1
+  %op2 = extractelement <2 x float> %v1, i32 0
+  %op3 = extractelement <2 x float> %v1, i32 1
+  %op4 = extractelement <2 x float> %v2, i32 0
+  %op5 = extractelement <2 x float> %v2, i32 1
+  %op6 = extractelement <2 x float> %v3, i32 0
+  %op7 = extractelement <2 x float> %v3, i32 1
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op0, float %op1, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op2, float %op3, <32 x float> %mai.0, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op4, float %op5, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op6, float %op7, <32 x float> %mai.2, i32 0, i32 0, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 PACK
+  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+  ; 1 MFMA
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  %store.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+  store <32 x float> %mai.3, ptr addrspace(3) %store.addr
+  ret void
+}
+

 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1

llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
Lines changed: 31 additions & 21 deletions

@@ -5,7 +5,7 @@

-; Inverted 1008: 01111110000
+; Inverted 1008: 001111110000
 ; GCN: After Inverting, SchedGroup Mask: 1008
 define amdgpu_kernel void @invert1() #0 {
 entry:

@@ -14,96 +14,106 @@ entry:
   ret void
 }

-; Inverted 2044: 11111111100
-; GCN: After Inverting, SchedGroup Mask: 2044
+; Inverted 4092: 111111111100
+; GCN: After Inverting, SchedGroup Mask: 4092
 define amdgpu_kernel void @invert2() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 2) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 2042: 11111111010
-; GCN: After Inverting, SchedGroup Mask: 2042
+; Inverted 4090: 111111111010
+; GCN: After Inverting, SchedGroup Mask: 4090
 define amdgpu_kernel void @invert4() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 4) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 2038: 11111110110
-; GCN: After Inverting, SchedGroup Mask: 2038
+; Inverted 4086: 111111110110
+; GCN: After Inverting, SchedGroup Mask: 4086
 define amdgpu_kernel void @invert8() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 8) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1935: 11110001111
-; GCN: After Inverting, SchedGroup Mask: 1935
+; Inverted 3983: 111110001111
+; GCN: After Inverting, SchedGroup Mask: 3983
 define amdgpu_kernel void @invert16() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 16) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1999: 11111001111
-; GCN: After Inverting, SchedGroup Mask: 1999
+; Inverted 4047: 111111001111
+; GCN: After Inverting, SchedGroup Mask: 4047
 define amdgpu_kernel void @invert32() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 32) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1967: 11110101111
-; GCN: After Inverting, SchedGroup Mask: 1967
+; Inverted 4015: 111110101111
+; GCN: After Inverting, SchedGroup Mask: 4015
 define amdgpu_kernel void @invert64() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 64) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1151: 10001111111
-; GCN: After Inverting, SchedGroup Mask: 1151
+; Inverted 3199: 110001111111
+; GCN: After Inverting, SchedGroup Mask: 3199
 define amdgpu_kernel void @invert128() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 128) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1663: 11001111111
-; GCN: After Inverting, SchedGroup Mask: 1663
+; Inverted 3711: 111001111111
+; GCN: After Inverting, SchedGroup Mask: 3711
 define amdgpu_kernel void @invert256() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 256) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1407: 10101111111
-; GCN: After Inverting, SchedGroup Mask: 1407
+; Inverted 3455: 110101111111
+; GCN: After Inverting, SchedGroup Mask: 3455
 define amdgpu_kernel void @invert512() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 512) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

-; Inverted 1022: 01111111110
-; GCN: After Inverting, SchedGroup Mask: 1022
+; Inverted 3070: 101111111110
+; GCN: After Inverting, SchedGroup Mask: 3070
 define amdgpu_kernel void @invert1024() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 1024) #1
   call void @llvm.amdcn.s.nop(i16 0) #1
   ret void
 }

+; Inverted 2046: 011111111110
+; GCN: After Inverting, SchedGroup Mask: 2046
+define amdgpu_kernel void @invert2048() #0 {
+entry:
+  call void @llvm.amdgcn.sched.barrier(i32 2048) #1
+  call void @llvm.amdcn.s.nop(i16 0) #1
+  ret void
+}
+

 declare void @llvm.amdgcn.sched.barrier(i32) #1
 declare void @llvm.amdcn.s.nop(i16) #1
