diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e062032313058..2406f0b8f8b1f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1385,7 +1385,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   llvm.amdgcn.sched.barrier                 Controls the types of instructions that may be allowed to cross the intrinsic
                                             during instruction scheduling. The parameter is a mask for the instruction types
-                                            that can cross the intrinsic.
+                                            that can cross the intrinsic. When bits for specific instruction types are set,
+                                            the more general bit (all ALU or all VMEM) is ignored.
 
                                             - 0x0000: No instructions may be scheduled across sched_barrier.
                                             - 0x0001: All, non-memory, non-side-effect producing instructions may be
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..35609a1e38c1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2619,8 +2619,14 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
 
+  // When given, specific bits overrule the more general ALU type.
+  bool HasConcreteALUClassSpecified =
+      (Mask & (SchedGroupMask::SALU | SchedGroupMask::VALU |
+               SchedGroupMask::MFMA)) != SchedGroupMask::NONE;
+
   // ALU implies VALU, SALU, MFMA, TRANS.
-  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
+  if (!HasConcreteALUClassSpecified &&
+      (InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
   // VALU, SALU, MFMA, TRANS implies ALU.
@@ -2630,8 +2636,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
            (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;
 
+  // When given, specific bits overrule the more general MEM type.
+  bool HasConcreteMemClassSpecified =
+      (Mask & (SchedGroupMask::VMEM_READ | SchedGroupMask::VMEM_WRITE |
+               SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE)) !=
+          SchedGroupMask::NONE;
+
   // VMEM implies VMEM_READ, VMEM_WRITE.
-  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
+  if (!HasConcreteMemClassSpecified &&
+      (InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
   // VMEM_READ, VMEM_WRITE implies VMEM.
   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
new file mode 100644
index 0000000000000..346755eb60fd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
@@ -0,0 +1,175 @@
+; RUN: llc -mtriple=amdgcn -debug-only=igrouplp < %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+define protected amdgpu_kernel void @sched_barrier_m0(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 0 (no bits set)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Applying IGroupLPDAGMutation...
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 0) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m1(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 1 (ALU Bit, implies all *-ALU bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1
+; GCN-NEXT: After Inverting, SchedGroup Mask: 1008
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 1) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m2(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 2 (VALU Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 2
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 2) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m4(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 4 (SALU Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 4
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 4) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m8(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 8 (MFMA Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 8
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2038
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 8) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m1024(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 1024 (TRANS Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1024
+; GCN-NEXT: After Inverting, SchedGroup Mask: 1022
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 1024) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m3(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 3 (ALU + VALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 3
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 3) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m5(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 5 (ALU + SALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 5
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 5) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m7(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 7 (ALU + VALU + SALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 7
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2040
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 7) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m15(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 15 (ALU + VALU + SALU + MFMA Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 15
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2032
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 15) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.barrier(i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
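For reviewers, the mask arithmetic the new test checks can be reproduced with a small standalone program. The sketch below is not the in-tree AMDGPUIGroupLP.cpp code: the bit constants are the llvm.amdgcn.sched.barrier mask values documented in AMDGPUUsage.rst (with TRANS = 0x400, as the test output shows), the DS-related implications that this patch does not touch are omitted, and the expected values are the "After Inverting" numbers checked by the new test.

// Standalone sketch of the patched mask inversion; not the LLVM implementation.
#include <cstdio>

enum SchedMaskBits : unsigned {
  ALU = 0x1,
  VALU = 0x2,
  SALU = 0x4,
  MFMA = 0x8,
  VMEM = 0x10,
  VMEM_READ = 0x20,
  VMEM_WRITE = 0x40,
  DS = 0x80,
  DS_READ = 0x100,
  DS_WRITE = 0x200,
  TRANS = 0x400,
  ALL = 0x7FF
};

// Mirrors the patched behaviour: concrete class bits in the user mask overrule
// the general ALU / VMEM bits when erasing implied bits from the inverse.
unsigned invertSchedBarrierMask(unsigned Mask) {
  unsigned Inverted = ~Mask & ALL;

  bool HasConcreteALUClass = (Mask & (SALU | VALU | MFMA)) != 0;
  // ALU implies VALU, SALU, MFMA, TRANS -- unless a concrete ALU class was given.
  if (!HasConcreteALUClass && (Inverted & ALU) == 0)
    Inverted &= ~(VALU | SALU | MFMA | TRANS);
  // If any of VALU, SALU, MFMA, TRANS is missing from the inverse, clear ALU too.
  else if ((Inverted & (VALU | SALU | MFMA | TRANS)) != (VALU | SALU | MFMA | TRANS))
    Inverted &= ~ALU;

  bool HasConcreteMemClass =
      (Mask & (VMEM_READ | VMEM_WRITE | DS_READ | DS_WRITE)) != 0;
  // VMEM implies VMEM_READ, VMEM_WRITE -- unless a concrete memory class was given.
  if (!HasConcreteMemClass && (Inverted & VMEM) == 0)
    Inverted &= ~(VMEM_READ | VMEM_WRITE);
  // If VMEM_READ or VMEM_WRITE is missing from the inverse, clear VMEM too.
  else if ((Inverted & (VMEM_READ | VMEM_WRITE)) != (VMEM_READ | VMEM_WRITE))
    Inverted &= ~VMEM;

  return Inverted;
}

int main() {
  // Masks used by the new test, paired with the "After Inverting" values it checks.
  const unsigned Cases[][2] = {{1, 1008},    {2, 2044}, {4, 2042}, {8, 2038},
                               {1024, 1022}, {3, 2044}, {5, 2042}, {7, 2040},
                               {15, 2032}};
  for (const auto &C : Cases)
    std::printf("mask %4u -> inverted %4u (test expects %4u)\n", C[0],
                invertSchedBarrierMask(C[0]), C[1]);
  return 0;
}

Compiling and running this (e.g. c++ -std=c++17 sketch.cpp && ./a.out) prints inverted masks 1008, 2044, 2042, 2038, 1022, 2044, 2042, 2040 and 2032, matching the values checked in llvm.amdgcn.sched.barrier.alu-bit.ll.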