1 change: 1 addition & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1347,6 +1347,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
- 0x0100: All DS read instructions may be scheduled across sched_barrier.
- 0x0200: All DS write instructions may be scheduled across sched_barrier.
- 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
- 0x0800: All Packed Arithmetic (e.g. V_PK_MOV, V_DOT, etc.) instructions may be scheduled across sched_barrier (see the sketch below).
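
A minimal usage sketch for the new bit (illustrative only; the kernel below is not part of this patch): passing mask 0x0800 (2048) lets only packed-arithmetic instructions migrate across the barrier.

define amdgpu_kernel void @pack_only_barrier(ptr addrspace(3) %p) {
entry:
  %v = load <2 x float>, ptr addrspace(3) %p
  ; Packed FMA; can select to V_PK_FMA_F32 on targets with packed FP32 math.
  %r = call <2 x float> @llvm.fma.v2f32(<2 x float> %v, <2 x float> %v, <2 x float> zeroinitializer)
  ; Only PACK (0x0800) instructions may be scheduled across this point.
  call void @llvm.amdgcn.sched.barrier(i32 2048)
  store <2 x float> %r, ptr addrspace(3) %p
  ret void
}

declare void @llvm.amdgcn.sched.barrier(i32)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)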

llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling
pipelines. The ordering between groups is enforced by the instruction scheduler.
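
As a sketch of how the new PACK group composes with the existing masks (the values mirror the test added below, not new functionality), a 1:1 PACK/MFMA pipeline can be requested from inside a kernel body like this:

; Arguments are (mask, size, SyncID).
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) ; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)    ; 1 MFMA
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) ; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)    ; 1 MFMA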
20 changes: 14 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
PACK = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
DS_READ | DS_WRITE | TRANS,
DS_READ | DS_WRITE | TRANS | PACK,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

@@ -2414,7 +2415,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;

else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)
&& !TII->isVOP3P(MI))
Result = true;

else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
@@ -2455,6 +2457,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;

else if (((SGMask & SchedGroupMask::PACK) != SchedGroupMask::NONE) &&
TII->isVOP3P(MI) && !TII->isMFMAorWMMA(MI))
Result = true;

LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
@@ -2634,15 +2640,17 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
// allowed past the SCHED_BARRIER.
SchedGroupMask InvertedMask = ~Mask;

// ALU implies VALU, SALU, MFMA, TRANS.
// ALU implies VALU, SALU, MFMA, TRANS, PACK.
if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
// VALU, SALU, MFMA, TRANS implies ALU.
~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS &
~SchedGroupMask::PACK;
// VALU, SALU, MFMA, TRANS, PACK implies ALU.
else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
(InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::PACK) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::ALU;

// VMEM implies VMEM_READ, VMEM_WRITE.
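For reference, a hand-worked inversion for the new bit, assuming the 12-bit mask layout documented above:

  barrier mask              = 0x800 (PACK only)
  bitwise inversion         = 0xFFF & ~0x800 = 0x7FF (2047)
  PACK absent, so clear ALU : 0x7FF & ~0x001 = 0x7FE (2046)

which is the value checked by the new invert2048 test below.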
183 changes: 183 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -1625,6 +1625,189 @@ entry:
ret void
}


define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_PACK_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in1) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v8, 7, v6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v7, s0, v8
; GCN-NEXT: s_movk_i32 s0, 0xff88
; GCN-NEXT: v_mad_i32_i24 v9, v6, s0, v7
; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
; GCN-NEXT: ds_read_b64 v[4:5], v9 offset:5120
; GCN-NEXT: ds_read_b128 a[28:31], v7 offset:112
; GCN-NEXT: ds_read_b128 a[24:27], v7 offset:96
; GCN-NEXT: ds_read_b128 a[20:23], v7 offset:80
; GCN-NEXT: ds_read_b128 a[16:19], v7 offset:64
; GCN-NEXT: ds_read_b128 a[0:3], v7
; GCN-NEXT: ds_read_b128 a[4:7], v7 offset:16
; GCN-NEXT: ds_read_b128 a[8:11], v7 offset:32
; GCN-NEXT: ds_read_b128 a[12:15], v7 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(8)
; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
; GCN-NEXT: v_add_u32_e32 v4, 0xc00, v9
; GCN-NEXT: v_lshl_add_u32 v10, v6, 3, v4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
; GCN-NEXT: ds_read_b64 v[4:5], v10 offset:3584
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-NEXT: v_add_u32_e32 v0, s1, v8
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
; GCN-NEXT: ds_write_b128 v0, a[0:3]
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v8, 7, v6
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_u32_e32 v7, s0, v8
; EXACTCUTOFF-NEXT: s_movk_i32 s0, 0xff88
; EXACTCUTOFF-NEXT: v_mad_i32_i24 v9, v6, s0, v7
; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v9 offset:5120
; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v7 offset:112
; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v7 offset:96
; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v7 offset:80
; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v7 offset:64
; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v7
; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v7 offset:16
; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v7 offset:32
; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v7 offset:48
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0xc00, v9
; EXACTCUTOFF-NEXT: v_lshl_add_u32 v10, v6, 3, v4
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 0
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v10 offset:3584
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 0
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 1
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v8
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 1
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
; EXACTCUTOFF-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
%el.0.addr = getelementptr <2 x float>, ptr addrspace(3) %in, i32 %idx
%el.0 = load <2 x float>, ptr addrspace(3) %el.0.addr
%el.1.addr = getelementptr <2 x float>, ptr addrspace(3) %el.0.addr, i32 64
%el.1 = load <2 x float>, ptr addrspace(3) %el.1.addr
%el.2.addr = getelementptr <2 x float>, ptr addrspace(3) %el.1.addr, i32 128
%el.2 = load <2 x float>, ptr addrspace(3) %el.2.addr
%el.3.addr = getelementptr <2 x float>, ptr addrspace(3) %el.2.addr, i32 192
%el.3 = load <2 x float>, ptr addrspace(3) %el.3.addr
%el.4.addr = getelementptr <2 x float>, ptr addrspace(3) %el.3.addr, i32 256
%el.4 = load <2 x float>, ptr addrspace(3) %el.4.addr
%el.5.addr = getelementptr <2 x float>, ptr addrspace(3) %el.4.addr, i32 %idx
%el.5 = load <2 x float>, ptr addrspace(3) %el.5.addr
%el.6.addr = getelementptr <2 x float>, ptr addrspace(3) %el.5.addr, i32 64
%el.6 = load <2 x float>, ptr addrspace(3) %el.6.addr
%el.7.addr = getelementptr <2 x float>, ptr addrspace(3) %el.6.addr, i32 128
%el.7 = load <2 x float>, ptr addrspace(3) %el.7.addr
%v0 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.0, <2 x float> %el.4, <2 x float> <float 0.0, float 0.0>)
%v1 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.1, <2 x float> %el.5, <2 x float> %v0)
%v2 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.2, <2 x float> %el.6, <2 x float> %v1)
%v3 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.3, <2 x float> %el.7, <2 x float> %v2)
%op0 = extractelement <2 x float> %v0, i32 0
%op1 = extractelement <2 x float> %v0, i32 1
%op2 = extractelement <2 x float> %v1, i32 0
%op3 = extractelement <2 x float> %v1, i32 1
%op4 = extractelement <2 x float> %v2, i32 0
%op5 = extractelement <2 x float> %v2, i32 1
%op6 = extractelement <2 x float> %v3, i32 0
%op7 = extractelement <2 x float> %v3, i32 1
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op0, float %op1, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op2, float %op3, <32 x float> %mai.0, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op4, float %op5, <32 x float> %mai.1, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op6, float %op7, <32 x float> %mai.2, i32 0, i32 0, i32 0)
; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
; 1 MFMA
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
; 1 MFMA
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
; 1 MFMA
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
; 1 PACK
call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
; 1 MFMA
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
%store.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
store <32 x float> %mai.3, ptr addrspace(3) %store.addr
ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
52 changes: 31 additions & 21 deletions llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
@@ -5,7 +5,7 @@



; Inverted 1008: 01111110000
; Inverted 1008: 001111110000
; GCN: After Inverting, SchedGroup Mask: 1008
define amdgpu_kernel void @invert1() #0 {
entry:
@@ -14,96 +14,106 @@ entry:
ret void
}

; Inverted 2044: 11111111100
; GCN: After Inverting, SchedGroup Mask: 2044
; Inverted 4092: 111111111100
; GCN: After Inverting, SchedGroup Mask: 4092
define amdgpu_kernel void @invert2() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 2) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 2042: 11111111010
; GCN: After Inverting, SchedGroup Mask: 2042
; Inverted 4090: 111111111010
; GCN: After Inverting, SchedGroup Mask: 4090
define amdgpu_kernel void @invert4() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 4) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 2038: 11111110110
; GCN: After Inverting, SchedGroup Mask: 2038
; Inverted 4086: 111111110110
; GCN: After Inverting, SchedGroup Mask: 4086
define amdgpu_kernel void @invert8() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 8) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1935: 11110001111
; GCN: After Inverting, SchedGroup Mask: 1935
; Inverted 3983: 111110001111
; GCN: After Inverting, SchedGroup Mask: 3983
define amdgpu_kernel void @invert16() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 16) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1999: 11111001111
; GCN: After Inverting, SchedGroup Mask: 1999
; Inverted 4047: 111111001111
; GCN: After Inverting, SchedGroup Mask: 4047
define amdgpu_kernel void @invert32() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 32) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1967: 11110101111
; GCN: After Inverting, SchedGroup Mask: 1967
; Inverted 4015: 111110101111
; GCN: After Inverting, SchedGroup Mask: 4015
define amdgpu_kernel void @invert64() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 64) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1151: 10001111111
; GCN: After Inverting, SchedGroup Mask: 1151
; Inverted 3199: 110001111111
; GCN: After Inverting, SchedGroup Mask: 3199
define amdgpu_kernel void @invert128() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 128) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1663: 11001111111
; GCN: After Inverting, SchedGroup Mask: 1663
; Inverted 3711: 111001111111
; GCN: After Inverting, SchedGroup Mask: 3711
define amdgpu_kernel void @invert256() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 256) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1407: 10101111111
; GCN: After Inverting, SchedGroup Mask: 1407
; Inverted 3455: 110101111111
; GCN: After Inverting, SchedGroup Mask: 3455
define amdgpu_kernel void @invert512() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 512) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 1022: 01111111110
; GCN: After Inverting, SchedGroup Mask: 1022
; Inverted 3070: 101111111110
; GCN: After Inverting, SchedGroup Mask: 3070
define amdgpu_kernel void @invert1024() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 1024) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}

; Inverted 2046: 011111111110
; GCN: After Inverting, SchedGroup Mask: 2046
define amdgpu_kernel void @invert2048() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 2048) #1
call void @llvm.amdcn.s.nop(i16 0) #1
ret void
}


declare void @llvm.amdgcn.sched.barrier(i32) #1
declare void @llvm.amdcn.s.nop(i16) #1
