Reintroduce TritonGEN::SplitBarrier[Arrive|Wait]Op and add its lowering to SPIRV dialect (#4523)

whitneywhtsang · web-flow · commit 4b882ad5d705 · 2025-06-18T10:00:56.000-04:00
This patch reverts the previous removal of the GEN split barrier operations
so that the software pipeliner transformation no longer emits SPIR-V-specific
operations.

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonGEN/tritongen-to-spirv.mlir b/test/TritonGEN/tritongen-to-spirv.mlir
@@ -15,3 +15,21 @@ llvm.func @triton_gen.barrier.global() {
   triton_gen.barrier { mem_fence = Global }
   llvm.return
 }
+
+// -----
+
+llvm.func @triton_gen.split_barrier_arrive() {
+  // CHECK-LABEL: triton_gen.split_barrier_arrive() {
+  // CHECK:       spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
+  triton_gen.split_barrier_arrive {execution_scope=WorkGroup, memory_scope=WorkGroup}
+  llvm.return
+}
+
+// -----
+
+llvm.func @triton_gen.split_barrier_wait() {
+  // CHECK-LABEL: triton_gen.split_barrier_wait() {
+  // CHECK:       spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
+  triton_gen.split_barrier_wait {execution_scope=WorkGroup, memory_scope=WorkGroup}
+  llvm.return
+}
diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir
@@ -7,6 +7,20 @@ llvm.func @triton_gen.barrier() {
   llvm.return
 }
 
+llvm.func @triton_gen.split_barrier_arrive() {
+  // CHECK-LABEL: triton_gen.split_barrier_arrive
+  // CHECK: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
+  triton_gen.split_barrier_arrive {execution_scope=WorkGroup, memory_scope=WorkGroup}
+  llvm.return
+}
+
+llvm.func @triton_gen.split_barrier_wait() {
+  // CHECK-LABEL: triton_gen.split_barrier_wait
+  // CHECK: triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
+  triton_gen.split_barrier_wait {execution_scope=WorkGroup, memory_scope=WorkGroup}
+  llvm.return
+}
+
 llvm.func @triton_gen.dpas(%c : vector<8xi32>, %a : vector<8xi16>, %b : vector<8xi32>) {
   // CHECK:      llvm.func @triton_gen.dpas(%arg0: vector<8xi32>, %arg1: vector<8xi16>, %arg2: vector<8xi32>) {
   // CHECK-NEXT:   %0 = triton_gen.dpas %arg0, %arg1, %arg2 {pa = i8, pb = i8, rc = 8} : (vector<8xi32>, vector<8xi16>, vector<8xi32>) -> vector<8xi32>
diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir
@@ -23,13 +23,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
     // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
-    // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
-    // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
+    // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
+    // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
     // CHECK:        ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
     // CHECK:        ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
     // CHECK:        tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
-    // WORKGROUP_SCOPE:   spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
-    // SUBGROUP_SCOPE:   spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
+    // WORKGROUP_SCOPE:   triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
+    // SUBGROUP_SCOPE:   triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg3 = %cst, %arg4 = %18, %arg5 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
       %55:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
@@ -70,13 +70,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
     // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
-    // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
-    // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
+    // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
+    // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
     // CHECK:        ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
     // CHECK:        ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
     // CHECK:        tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
-    // WORKGROUP_SCOPE:   spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
-    // SUBGROUP_SCOPE:   spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
+    // WORKGROUP_SCOPE:   triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
+    // SUBGROUP_SCOPE:   triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
       %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td
@@ -47,6 +47,59 @@ def TritonGEN_BarrierOp : TritonGEN_Op<"barrier"> {
   }];
 }
 
+def TritonGEN_SplitBarrierArriveOp : TritonGEN_Op<"split_barrier_arrive"> {
+  let summary = "Split barrier signal";
+  let description = [{
+    Indicates that an invocation has arrived at a split control barrier. This
+    may allow other invocations waiting on the split control barrier to continue
+    executing.
+
+    When `execution_scope` is `WorkGroup` or larger, behavior is undefined
+    unless all invocations within that scope execute the same dynamic instance
+    of this operation. When `execution_scope` is `SubGroup` or narrower, the
+    behavior of this operation in non-uniform control flow is defined by the
+    client API.
+
+    This operation has no memory semantics attribute: the TritonGEN to SPIR-V
+    lowering always emits `spirv.INTEL.ControlBarrierArrive` with `None`
+    semantics, in which case the instruction ignores its memory scope, so
+    `memory_scope` is only forwarded and implies no memory ordering there.
+  }];
+  let arguments = (ins TritonGEN_MemScope:$execution_scope, TritonGEN_MemScope:$memory_scope);
+  let results = (outs);
+  let assemblyFormat = [{
+    ` ` `{` `execution_scope` `=` $execution_scope `,` `memory_scope` `=` $memory_scope `}` attr-dict
+  }];
+}
+
+def TritonGEN_SplitBarrierWaitOp : TritonGEN_Op<"split_barrier_wait"> {
+  let summary = "Split barrier wait";
+  let description = [{
+    Waits for other invocations of this module to arrive at a split control
+    barrier.
+
+    When `execution_scope` is `WorkGroup` or larger, behavior is undefined
+    unless all invocations within that scope execute the same dynamic instance
+    of this operation. When `execution_scope` is `SubGroup` or narrower, the
+    behavior of this operation in non-uniform control flow is defined by the
+    client API.
+
+    The underlying SPIR-V instruction can additionally act as the end of a
+    memory barrier (similar to `OpMemoryBarrier`) over its memory scope, but
+    only when its memory semantics are not `None`. This operation has no
+    semantics attribute, and the TritonGEN to SPIR-V lowering always emits
+    `spirv.INTEL.ControlBarrierWait` with `None` semantics, in which case the
+    instruction ignores its memory scope. `memory_scope` is therefore only
+    forwarded to the lowered operation and implies no memory ordering in that
+    lowering.
+  }];
+  let arguments = (ins TritonGEN_MemScope:$execution_scope, TritonGEN_MemScope:$memory_scope);
+  let results = (outs);
+  let assemblyFormat = [{
+    ` ` `{` `execution_scope` `=` $execution_scope `,` `memory_scope` `=` $memory_scope `}` attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Matrix operations
 //===----------------------------------------------------------------------===//
diff --git a/third_party/intel/lib/TritonGENToSPIRV/TritonGENToSPIRVPass.cpp b/third_party/intel/lib/TritonGENToSPIRV/TritonGENToSPIRVPass.cpp
@@ -24,6 +24,17 @@ using namespace mlir::triton;
 
 namespace {
 
+/// Converts a TritonGEN scope into the corresponding SPIR-V scope. Only the
+/// work-group and sub-group scopes have a mapping; reaching this helper with
+/// any other value is a programming error and hits llvm_unreachable.
+static spirv::Scope getSpirvScope(TritonGEN::MemScope scope) {
+  if (scope == TritonGEN::MemScope::WORK_GROUP)
+    return spirv::Scope::Workgroup;
+  if (scope == TritonGEN::MemScope::SUB_GROUP)
+    return spirv::Scope::Subgroup;
+  llvm_unreachable("unexpected MemScope value");
+}
+
 struct TritonGENBarrierLowering
     : public OpConversionPattern<TritonGEN::BarrierOp> {
   using OpConversionPattern<TritonGEN::BarrierOp>::OpConversionPattern;
@@ -57,6 +68,35 @@ struct TritonGENBarrierLowering
   }
 };
 
+/// Lowers `triton_gen.split_barrier_arrive` to the SPIR-V INTEL arrive
+/// barrier, translating both scopes and using `None` memory semantics.
+struct TritonGENSplitBarrierArriveLowering
+    : OpConversionPattern<TritonGEN::SplitBarrierArriveOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(TritonGEN::SplitBarrierArriveOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<spirv::INTELControlBarrierArriveOp>(
+        op, getSpirvScope(op.getExecutionScope()),
+        getSpirvScope(op.getMemoryScope()), spirv::MemorySemantics::None);
+    return success();
+  }
+};
+
+/// Lowers triton_gen.split_barrier_wait to spirv.INTEL.ControlBarrierWait.
+struct TritonGENSplitBarrierWaitLowering
+    : OpConversionPattern<TritonGEN::SplitBarrierWaitOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(TritonGEN::SplitBarrierWaitOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<spirv::INTELControlBarrierWaitOp>(
+        op, getSpirvScope(op.getExecutionScope()),
+        getSpirvScope(op.getMemoryScope()), spirv::MemorySemantics::None);
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -100,5 +140,6 @@ struct ConvertTritonGENToSPIRV
 
 void mlir::triton::populateTritonGENToSPIRVConversionPatterns(
     RewritePatternSet &patterns) {
-  patterns.add<TritonGENBarrierLowering>(patterns.getContext());
+  patterns.add<TritonGENBarrierLowering, TritonGENSplitBarrierArriveLowering,
+               TritonGENSplitBarrierWaitLowering>(patterns.getContext());
 }
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp
@@ -38,9 +38,9 @@ static bool preCondition(scf::ForOp forOp) {
   return true;
 }
 
-static void
-pipelineLoop(scf::ForOp forOp, int numStages,
-             std::optional<spirv::Scope> barrierScope = std::nullopt) {
+static void pipelineLoop(
+    scf::ForOp forOp, int numStages,
+    std::optional<triton::TritonGEN::MemScope> barrierScope = std::nullopt) {
   mlir::scf::PipeliningOption options;
   if (!preCondition(forOp))
     return;
@@ -60,18 +60,18 @@ pipelineLoop(scf::ForOp forOp, int numStages,
 
   scf::ForOp loop = (*newForOp);
   if (barrierScope) {
-    assert((*barrierScope == spirv::Scope::Subgroup) ||
-           (*barrierScope == spirv::Scope::Workgroup) &&
-               "The barrier scope must be SubGroup or Workgroup");
+    assert((*barrierScope == triton::TritonGEN::MemScope::SUB_GROUP ||
+            *barrierScope == triton::TritonGEN::MemScope::WORK_GROUP) &&
+           "The barrier scope must be SubGroup or Workgroup");
     OpBuilder b(loop);
     Location loc = loop.getLoc();
     b.setInsertionPointToStart(loop.getBody());
-    b.create<spirv::INTELControlBarrierArriveOp>(
-        loc, *barrierScope, *barrierScope, spirv::MemorySemantics::None);
+    b.create<triton::TritonGEN::SplitBarrierArriveOp>(loc, *barrierScope,
+                                                      *barrierScope);
     auto yield = cast<scf::YieldOp>(loop.getBody()->getTerminator());
     b.setInsertionPoint(yield);
-    b.create<spirv::INTELControlBarrierWaitOp>(
-        loc, *barrierScope, *barrierScope, spirv::MemorySemantics::None);
+    b.create<triton::TritonGEN::SplitBarrierWaitOp>(loc, *barrierScope,
+                                                    *barrierScope);
   }
 }
 
@@ -92,15 +92,15 @@ struct IntelGPUPipelinePass
     if (numStages <= 1)
       return;
 
-    std::optional<spirv::Scope> barrierScope = std::nullopt;
+    std::optional<triton::TritonGEN::MemScope> barrierScope = std::nullopt;
     switch (splitBarrierScope) {
     case ttgi::SplitBarrierScope::None:
       break;
     case ttgi::SplitBarrierScope::Workgroup:
-      barrierScope = spirv::Scope::Workgroup;
+      barrierScope = triton::TritonGEN::MemScope::WORK_GROUP;
       break;
     case ttgi::SplitBarrierScope::Subgroup:
-      barrierScope = spirv::Scope::Subgroup;
+      barrierScope = triton::TritonGEN::MemScope::SUB_GROUP;
       break;
     }