diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index ab507e3714ebb..13f1ab5483a21 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1347,6 +1347,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. - 0x0100: All DS read instructions may be scheduled across sched_barrier. - 0x0200: All DS write instructions may be scheduled across sched_barrier. - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier. + - 0x0800: All Packed Arithmetic (e.g. V_PK_MOV, V_DOT, etc.) instructions may be scheduled across sched_barrier. llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling pipelines. The ordering between groups is enforced by the instruction scheduler. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index bbd262748d680..b8ee898b2058e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -75,8 +75,9 @@ enum class SchedGroupMask { DS_READ = 1u << 8, DS_WRITE = 1u << 9, TRANS = 1u << 10, + PACK = 1u << 11, ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | - DS_READ | DS_WRITE | TRANS, + DS_READ | DS_WRITE | TRANS | PACK, LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) }; @@ -2414,7 +2415,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { Result = true; else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && - TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) + TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) && + !TII->isVOP3P(MI)) Result = true; else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && @@ -2455,6 +2457,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { TII->isTRANS(MI)) Result = true; + else if (((SGMask & SchedGroupMask::PACK) != SchedGroupMask::NONE) && + TII->isVOP3P(MI) && !TII->isMFMAorWMMA(MI)) + Result = true; + LLVM_DEBUG( dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << (Result ? " could classify " : " unable to classify ") << MI); @@ -2634,15 +2640,17 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { // allowed past the SCHED_BARRIER. SchedGroupMask InvertedMask = ~Mask; - // ALU implies VALU, SALU, MFMA, TRANS. + // ALU implies VALU, SALU, MFMA, TRANS, PACK. if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & - ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; - // VALU, SALU, MFMA, TRANS implies ALU. + ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS & + ~SchedGroupMask::PACK; + // VALU, SALU, MFMA, TRANS, PACK implies ALU. else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || - (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE || + (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE || + (InvertedMask & SchedGroupMask::PACK) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::ALU; // VMEM implies VMEM_READ, VMEM_WRITE. 
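Usage sketch (not part of the patch; kernel names are placeholders): the new bit is 0x0800, i.e. decimal 2048, and it is accepted by both llvm.amdgcn.sched.barrier and llvm.amdgcn.sched.group.barrier, as the tests below exercise.

```llvm
; Sketch only: mask 0x800 allows only packed-arithmetic (VOP3P) instructions
; to be scheduled across this barrier.
define amdgpu_kernel void @sched_barrier_pack_sketch() {
entry:
  call void @llvm.amdgcn.sched.barrier(i32 2048)
  ret void
}

; Sketch only: alternate one packed-arithmetic instruction (mask 0x800) with
; one MFMA instruction (mask 0x8) in the requested scheduling pipeline.
define amdgpu_kernel void @sched_group_barrier_pack_mfma_sketch() {
entry:
  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ret void
}

declare void @llvm.amdgcn.sched.barrier(i32)
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32)
```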
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 37f335561a52c..ab1d6ca5f243f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -1625,6 +1625,189 @@ entry: ret void } + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_PACK_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in1) #0 { +; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 7, v6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v7, s0, v8 +; GCN-NEXT: s_movk_i32 s0, 0xff88 +; GCN-NEXT: v_mad_i32_i24 v9, v6, s0, v7 +; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1 +; GCN-NEXT: ds_read_b64 v[4:5], v9 offset:5120 +; GCN-NEXT: ds_read_b128 a[28:31], v7 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v7 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v7 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v7 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v7 +; GCN-NEXT: ds_read_b128 a[4:7], v7 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v7 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v7 offset:48 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0] +; GCN-NEXT: v_add_u32_e32 v4, 0xc00, v9 +; GCN-NEXT: v_lshl_add_u32 v10, v6, 3, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5 +; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1] +; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6 +; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5] +; GCN-NEXT: ds_read_b64 v[4:5], v10 offset:3584 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1] +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-NEXT: v_add_u32_e32 v0, s1, v8 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: s_endpgm +; +; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA: +; EXACTCUTOFF: ; %bb.0: ; %entry +; 
EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v8, 7, v6 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_add_u32_e32 v7, s0, v8 +; EXACTCUTOFF-NEXT: s_movk_i32 s0, 0xff88 +; EXACTCUTOFF-NEXT: v_mad_i32_i24 v9, v6, s0, v7 +; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1 +; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v9 offset:5120 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v7 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v7 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v7 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v7 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v7 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v7 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v7 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v7 offset:48 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) +; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0] +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0xc00, v9 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v10, v6, 3, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1] +; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 0 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5] +; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v10 offset:3584 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 0 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1] +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v8 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %el.0.addr = getelementptr <2 x float>, ptr addrspace(3) %in, i32 %idx + %el.0 = load <2 x 
float>, ptr addrspace(3) %el.0.addr + %el.1.addr = getelementptr <2 x float>, ptr addrspace(3) %el.0.addr, i32 64 + %el.1 = load <2 x float>, ptr addrspace(3) %el.1.addr + %el.2.addr = getelementptr <2 x float>, ptr addrspace(3) %el.1.addr, i32 128 + %el.2 = load <2 x float>, ptr addrspace(3) %el.2.addr + %el.3.addr = getelementptr <2 x float>, ptr addrspace(3) %el.2.addr, i32 192 + %el.3 = load <2 x float>, ptr addrspace(3) %el.3.addr + %el.4.addr = getelementptr <2 x float>, ptr addrspace(3) %el.3.addr, i32 256 + %el.4 = load <2 x float>, ptr addrspace(3) %el.4.addr + %el.5.addr = getelementptr <2 x float>, ptr addrspace(3) %el.4.addr, i32 %idx + %el.5 = load <2 x float>, ptr addrspace(3) %el.5.addr + %el.6.addr = getelementptr <2 x float>, ptr addrspace(3) %el.5.addr, i32 64 + %el.6 = load <2 x float>, ptr addrspace(3) %el.6.addr + %el.7.addr = getelementptr <2 x float>, ptr addrspace(3) %el.6.addr, i32 128 + %el.7 = load <2 x float>, ptr addrspace(3) %el.7.addr + %v0 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.0, <2 x float> %el.4, <2 x float> <float 0.000000e+00, float 0.000000e+00>) + %v1 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.1, <2 x float> %el.5, <2 x float> %v0) + %v2 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.2, <2 x float> %el.6, <2 x float> %v1) + %v3 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.3, <2 x float> %el.7, <2 x float> %v2) + %op0 = extractelement <2 x float> %v0, i32 0 + %op1 = extractelement <2 x float> %v0, i32 1 + %op2 = extractelement <2 x float> %v1, i32 0 + %op3 = extractelement <2 x float> %v1, i32 1 + %op4 = extractelement <2 x float> %v2, i32 0 + %op5 = extractelement <2 x float> %v2, i32 1 + %op6 = extractelement <2 x float> %v3, i32 0 + %op7 = extractelement <2 x float> %v3, i32 1 + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op0, float %op1, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op2, float %op3, <32 x float> %mai.0, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op4, float %op5, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op6, float %op7, <32 x float> %mai.2, i32 0, i32 0, i32 0) + ; 1 PACK + call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 PACK + call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 PACK + call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 PACK + call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + %store.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.3, ptr addrspace(3) %store.addr + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #2 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll index c20dbba42ccd4..14762dc329d9d 100644 --- a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll @@ -5,7 +5,7 @@ -; Inverted 1008: 01111110000 +; Inverted 1008: 001111110000 ; GCN: After Inverting, SchedGroup Mask: 1008 define amdgpu_kernel void @invert1() #0 { entry: @@ -14,8 +14,8 @@ entry: ret void } -; Inverted 2044: 11111111100 -; GCN: After Inverting, SchedGroup Mask: 2044 +; Inverted 4092: 111111111100 +; GCN: After Inverting, SchedGroup Mask: 4092 define amdgpu_kernel void @invert2() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 2) #1 @@ -23,8 +23,8 @@ entry: ret void } -; Inverted 2042: 11111111010 -; GCN: After Inverting, SchedGroup Mask: 2042 +; Inverted 4090: 111111111010 +; GCN: After Inverting, SchedGroup Mask: 4090 define amdgpu_kernel void @invert4() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 4) #1 @@ -32,8 +32,8 @@ entry: ret void } -; Inverted 2038: 11111110110 -; GCN: After Inverting, SchedGroup Mask: 2038 +; Inverted 4086: 111111110110 +; GCN: After Inverting, SchedGroup Mask: 4086 define amdgpu_kernel void @invert8() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 8) #1 @@ -41,8 +41,8 @@ entry: ret void } -; Inverted 1935: 11110001111 -; GCN: After Inverting, SchedGroup Mask: 1935 +; Inverted 3983: 111110001111 +; GCN: After Inverting, SchedGroup Mask: 3983 define amdgpu_kernel void @invert16() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 16) #1 @@ -50,8 +50,8 @@ entry: ret void } -; Inverted 1999: 11111001111 -; GCN: After Inverting, SchedGroup Mask: 1999 +; Inverted 4047: 111111001111 +; GCN: After Inverting, SchedGroup Mask: 4047 define amdgpu_kernel void @invert32() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 32) #1 @@ -59,8 +59,8 @@ entry: ret void } -; Inverted 1967: 11110101111 -; GCN: After Inverting, SchedGroup Mask: 1967 +; Inverted 4015: 111110101111 +; GCN: After Inverting, SchedGroup Mask: 4015 define amdgpu_kernel void @invert64() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 64) #1 @@ -68,8 +68,8 @@ entry: ret void } -; Inverted 1151: 10001111111 -; GCN: After Inverting, SchedGroup Mask: 1151 +; Inverted 3199: 110001111111 +; GCN: After Inverting, SchedGroup Mask: 3199 define amdgpu_kernel void @invert128() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 128) #1 @@ -77,8 +77,8 @@ entry: ret void } -; Inverted 1663: 11001111111 -; GCN: After Inverting, SchedGroup Mask: 1663 +; Inverted 3711: 111001111111 +; GCN: After Inverting, SchedGroup Mask: 3711 define amdgpu_kernel void @invert256() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 256) #1 @@ -86,8 +86,8 @@ entry: ret void } -; Inverted 1407: 10101111111 -; GCN: After Inverting, SchedGroup Mask: 1407 +; Inverted 3455: 110101111111 +; GCN: After Inverting, SchedGroup Mask: 3455 define amdgpu_kernel void @invert512() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 512) #1 @@ -95,8 +95,8 @@ entry: ret void } -; Inverted 1022: 01111111110 -; GCN: After Inverting, SchedGroup Mask: 1022 +; Inverted 3070: 101111111110 +; GCN: After Inverting, SchedGroup Mask: 3070 define amdgpu_kernel void @invert1024() #0 { entry: call void @llvm.amdgcn.sched.barrier(i32 1024) #1 @@ -104,6 +104,16 @@ entry: ret void } +; Inverted 2046: 011111111110 +; GCN: After Inverting, SchedGroup Mask: 2046 +define amdgpu_kernel void @invert2048() #0 { +entry: + call void @llvm.amdgcn.sched.barrier(i32 2048) #1 + call void @llvm.amdcn.s.nop(i16 0) #1 + ret void +} + + declare void @llvm.amdgcn.sched.barrier(i32) #1 declare void @llvm.amdcn.s.nop(i16) #1
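Sanity check for the new invert2048 case, following the implication rules added in AMDGPUIGroupLP.cpp above: inverting mask 0x800 over the 12 valid mask bits gives 0x7FF (2047); because PACK is now excluded, the "PACK implies ALU" rule also clears the ALU bit (0x1), leaving 0x7FE = 2046 = 011111111110, which matches the expected FileCheck value.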