3 changes: 2 additions & 1 deletion llvm/docs/AMDGPUUsage.rst
@@ -1385,7 +1385,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

 llvm.amdgcn.sched.barrier Controls the types of instructions that may be allowed to cross the intrinsic
 during instruction scheduling. The parameter is a mask for the instruction types
-that can cross the intrinsic.
+that can cross the intrinsic. When bits for specific instruction types are set,
+their more general counterpart (all ALU or all VMEM) is ignored.
 
 - 0x0000: No instructions may be scheduled across sched_barrier.
 - 0x0001: All, non-memory, non-side-effect producing instructions may be
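As an illustration of the new rule from the source side, here is a minimal sketch. The helper name salu_only_barrier is made up, and the clang builtin __builtin_amdgcn_sched_barrier (which lowers to this intrinsic) is used on the assumption that it is available; the hex constants are the mask bits listed above.

// Illustrative sketch only; the helper name is hypothetical and the builtin
// __builtin_amdgcn_sched_barrier is assumed to be available in clang.
static inline void salu_only_barrier() {
  // 0x0001 | 0x0004 == ALU | SALU. With the rule above, the specific SALU bit
  // is honored and the general ALU bit is ignored, so only SALU instructions
  // may be scheduled across this point.
  __builtin_amdgcn_sched_barrier(0x0001 | 0x0004);
}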
17 changes: 15 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2619,8 +2619,14 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
 
+  // When given, specific bits overrule the more general ALU type.
+  bool HasConcreteALUClassSpecified =
+      (Mask & (SchedGroupMask::SALU | SchedGroupMask::VALU |
+               SchedGroupMask::MFMA)) != SchedGroupMask::NONE;
+
   // ALU implies VALU, SALU, MFMA, TRANS.
-  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
+  if (!HasConcreteALUClassSpecified &&
+      (InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
   // VALU, SALU, MFMA, TRANS implies ALU.
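To make the effect of the new guard concrete, here is a standalone sketch of just the ALU half of the inversion, using plain unsigned bit math instead of the real SchedGroupMask type. The function name invertAluPart is made up for this sketch, and the bit values are the ones documented in AMDGPUUsage.rst; it is not the pass itself.

// Standalone sketch (not the real pass): the ALU half of the mask inversion.
#include <cassert>
#include <cstdint>

namespace {
constexpr uint32_t ALU = 0x001, VALU = 0x002, SALU = 0x004, MFMA = 0x008,
                   TRANS = 0x400, ALL = 0x7FF;

uint32_t invertAluPart(uint32_t Mask) {
  uint32_t Inverted = ~Mask & ALL;
  // New guard: a concrete ALU class in the original mask wins over plain ALU.
  bool HasConcreteALUClass = (Mask & (SALU | VALU | MFMA)) != 0;
  if (!HasConcreteALUClass && (Inverted & ALU) == 0)
    Inverted &= ~(VALU | SALU | MFMA | TRANS); // ALU implies its subtypes.
  else if ((Inverted & VALU) == 0 || (Inverted & SALU) == 0 ||
           (Inverted & MFMA) == 0 || (Inverted & TRANS) == 0)
    Inverted &= ~ALU; // An allowed ALU subtype also allows generic ALU.
  return Inverted;
}
} // namespace

int main() {
  // Mask 0x1 (ALU only): every ALU class may cross, so the blocked group is
  // just the memory classes, 0x3F0 == 1008 (see sched_barrier_m1 below).
  assert(invertAluPart(0x1) == 1008);
  // Mask 0x3 (ALU | VALU): the concrete VALU bit now overrides ALU, so
  // SALU/MFMA/TRANS stay blocked: 0x7FC == 2044 (see sched_barrier_m3 below).
  // Without the new guard this would also have collapsed to 1008.
  assert(invertAluPart(0x3) == 2044);
  return 0;
}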
@@ -2630,8 +2636,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
       (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;
 
+  // When given, specific bits overrule the more general MEM type.
+  bool HasConcreteMemClassSpecified =
+      (Mask & (SchedGroupMask::VMEM_READ | SchedGroupMask::VMEM_WRITE |
+               SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE)) !=
+      SchedGroupMask::NONE;
+
   // VMEM implies VMEM_READ, VMEM_WRITE.
-  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
+  if (!HasConcreteMemClassSpecified &&
+      (InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
   // VMEM_READ, VMEM_WRITE implies VMEM.
   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
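The same kind of sketch for the VMEM half of this hunk, again with a made-up helper name (invertVmemPart) and plain unsigned math; the DS handling that follows in the real function is omitted, so this only models the rules shown above.

// Standalone sketch (not the real pass): the VMEM half of the mask inversion.
#include <cassert>
#include <cstdint>

namespace {
constexpr uint32_t VMEM = 0x010, VMEM_READ = 0x020, VMEM_WRITE = 0x040,
                   DS_READ = 0x100, DS_WRITE = 0x200, ALL = 0x7FF;

uint32_t invertVmemPart(uint32_t Mask) {
  uint32_t Inverted = ~Mask & ALL;
  // New guard: a concrete memory class in the original mask wins over VMEM.
  bool HasConcreteMemClass =
      (Mask & (VMEM_READ | VMEM_WRITE | DS_READ | DS_WRITE)) != 0;
  if (!HasConcreteMemClass && (Inverted & VMEM) == 0)
    Inverted &= ~(VMEM_READ | VMEM_WRITE); // VMEM implies both access kinds.
  else if ((Inverted & VMEM_READ) == 0 || (Inverted & VMEM_WRITE) == 0)
    Inverted &= ~VMEM; // An allowed access kind also allows generic VMEM.
  return Inverted;
}
} // namespace

int main() {
  // Mask 0x30 (VMEM | VMEM_READ): the concrete VMEM_READ bit wins, so
  // VMEM_WRITE stays in the blocked group. The old code cleared it as well,
  // treating the mask exactly like plain VMEM (0x10).
  assert((invertVmemPart(0x30) & VMEM_WRITE) != 0);
  assert((invertVmemPart(0x10) & VMEM_WRITE) == 0);
  return 0;
}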
175 changes: 175 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
@@ -0,0 +1,175 @@
; REQUIRES: asserts
; RUN: llc -mtriple=amdgcn -debug-only=igrouplp < %s 2>&1 | FileCheck -check-prefix=GCN %s

define protected amdgpu_kernel void @sched_barrier_m0(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 0 (no bits set)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Applying IGroupLPDAGMutation...
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 0) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m1(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 1 (ALU Bit, implies all *-ALU bits)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1
; GCN-NEXT: After Inverting, SchedGroup Mask: 1008
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 1) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m2(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 2 (VALU Bit)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 2
; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 2) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m4(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 4 (SALU Bit)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 4
; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 4) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m8(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 8 (MFMA Bit)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 8
; GCN-NEXT: After Inverting, SchedGroup Mask: 2038
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 8) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m1024(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 1024 (TRANS Bit)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1024
; GCN-NEXT: After Inverting, SchedGroup Mask: 1022
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 1024) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m3(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 3 (ALU + VALU Bits)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 3
; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 3) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m5(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 5 (ALU + SALU Bits)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 5
; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 5) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m7(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 7 (ALU + VALU + SALU Bits)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 7
; GCN-NEXT: After Inverting, SchedGroup Mask: 2040
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 7) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

define protected amdgpu_kernel void @sched_barrier_m15(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
;
; Set mask to 15 (ALU + VALU + SALU + MFMA Bits)
;
; GCN: Applying IGroupLPDAGMutation...
; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 15
; GCN-NEXT: After Inverting, SchedGroup Mask: 2032
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
%0 = load float, ptr addrspace(3) %arrayidx, align 4
call void @llvm.amdgcn.sched.barrier(i32 15) #1
%add = fadd contract float %0, 1.000000e+00
%arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
store float %add, ptr addrspace(3) %arrayidx3, align 4
ret void
}

declare void @llvm.amdgcn.sched.barrier(i32) #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
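For readers who want to sanity-check the FileCheck values above by hand, the expected "After Inverting" numbers follow from the documented bit assignments (all instruction-type bits together are 0x7FF). Two worked examples as a small C++ sketch, not part of the test:

// Mask 15 sets ALU|VALU|SALU|MFMA, so the plain inversion 0x7F0 == 2032 is
// already the final blocked group (sched_barrier_m15).
static_assert((~15u & 0x7FFu) == 2032u, "sched_barrier_m15");
// Mask 1024 sets only TRANS; since that ALU subtype may cross, the generic
// ALU bit is also dropped from the blocked group: 1023 & ~1 == 1022
// (sched_barrier_m1024).
static_assert(((~1024u & 0x7FFu) & ~1u) == 1022u, "sched_barrier_m1024");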