Commit 057d82c
[XPU][Membar] Define basic Intel-specific Membar filter (#2640)
Define an Intel-specific `Membar` filter to reduce synchronization overhead in the Intel backend.

The `Membar` analysis inserts barriers to avoid race conditions between operations accessing the same memory. The granularity of this analysis is somewhat coarse, though, and unneeded barriers may be inserted. To avoid this, the analysis allows users to pass a callback function that filters out known-safe cases.

As one of the recurring cases of barriers being inserted when not needed, detect back-to-back layout conversions implemented as sub-group transposes as safe, so no barriers are inserted between them. See the code for further details on why this is safe.

---------

Signed-off-by: victor-eds <[email protected]>
1 parent 16f5738 commit 057d82c
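
For context, the filter is an ordinary predicate over operation pairs that the backend hands to the membar analysis. A minimal sketch of the mechanism, assuming the upstream analysis accepts a std::function-style callback (the alias name MembarFilterFn is an assumption; the membarFilter signature and the constructor call are taken from this commit's diff below):

    #include <functional>

    namespace mlir {
    class Operation;

    // Assumed callback shape: returning true means "no barrier is needed
    // between these two operations", so the analysis may skip inserting one.
    using MembarFilterFn = std::function<bool(Operation *, Operation *)>;

    namespace intel {
    // Declared in the new Analysis/Membar.h below.
    bool membarFilter(Operation *lhsOp, Operation *rhsOp);
    } // namespace intel
    } // namespace mlir

    // Hook-up, as done in TritonGPUToLLVM.cpp below:
    //   ModuleMembarAnalysis membarPass(&allocation, ::mlir::intel::membarFilter);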

File tree

5 files changed: +98 −3 lines changed

test/Conversion/intel/sub-group-transpose.mlir

Lines changed: 17 additions & 0 deletions

@@ -426,3 +426,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return %0 : tensor<32x64xf32, #blocked1>
   }
 }
+
+// -----
+
+// Test no barriers are inserted when back to back transpositions are performed.
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: llvm.func spir_kernelcc @test_back_to_back
+  // CHECK-NOT: barrier
+  tt.func @test_back_to_back(%arg0: tensor<32x64xf32, #blocked>, %arg1: tensor<32x64xf32, #blocked>) -> (tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>) {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    %1 = triton_gpu.convert_layout %arg1 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    tt.return %0, %1 : tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>
+  }
+}
third_party/intel/include/Analysis/Membar.h

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+#ifndef TRITON_INTEL_ANALYSIS_MEMBAR_H
+#define TRITON_INTEL_ANALYSIS_MEMBAR_H
+
+namespace mlir {
+class Operation;
+namespace intel {
+/// Intel-specific callback to filter operations that need no barriers between
+/// each other.
+///
+/// This is useful as the granularity to check whether barriers are needed is
+/// quite coarse. The filter will return true if no barrier is needed between
+/// `lhsOp` and `rhsOp`.
+bool membarFilter(Operation *lhsOp, Operation *rhsOp);
+} // namespace intel
+} // namespace mlir
+
+#endif // TRITON_INTEL_ANALYSIS_MEMBAR_H
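
As a usage note, the analysis treats this callback as a veto. A sketch of the assumed call-site shape inside the upstream membar analysis (this control flow is an illustration, not code from this commit):

    // Sketch: before separating two potentially conflicting operations with a
    // barrier, let the backend-specific filter rule the pair out as safe.
    if (filter && filter(lhsOp, rhsOp))
      return; // backend guarantees no synchronization is needed for this pair
    insertBarrier(builder, lhsOp);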

third_party/intel/lib/Analysis/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@ add_triton_library(TritonIntelAnalysis
   AxisInfo.cpp
   DPAS.cpp
   Liveness.cpp
+  Membar.cpp
   Utility.cpp
 
   DEPENDS
third_party/intel/lib/Analysis/Membar.cpp

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#include "intel/include/Analysis/Membar.h"
+
+#include "intel/include/Analysis/Utility.h"
+
+namespace mlir::intel {
+namespace {
+triton::gpu::ConvertLayoutOp dynCastToSubGroupTranspose(Operation *op) {
+  auto convertLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op);
+  if (!convertLayout)
+    return nullptr;
+
+  if (!triton::gpu::intel::cvtIsSubGroupTranspose(
+          convertLayout.getSrc().getType(),
+          convertLayout.getResult().getType()))
+    return nullptr;
+
+  return convertLayout;
+}
+
+/// Check whether `lhsOp` and `rhsOp` are sub-group transpose layout
+/// conversions that are safe to execute back-to-back.
+///
+/// Sub-group transposes are implemented as follows:
+///
+/// - Each sub-group writes all the elements it is handling to a memory block.
+/// - Each sub-group reads all the elements it is handling from the same memory
+///   region.
+///
+/// As there is no need to synchronize work-items in the same sub-group and we
+/// know data won't be shared between sub-groups, executing these operations
+/// back-to-back with no barriers in between is safe.
+bool areSafeToOverlapSubGroupTransposeOps(Operation *lhsOp, Operation *rhsOp) {
+  // Check both are lowered to sub-group transpose operations.
+  auto lhsTranspose = dynCastToSubGroupTranspose(lhsOp);
+  if (!lhsTranspose)
+    return false;
+  auto rhsTranspose = dynCastToSubGroupTranspose(rhsOp);
+  if (!rhsTranspose)
+    return false;
+
+  // Check the types of source and result are the same, i.e., we are expressing
+  // the same kind of transposition.
+  if (lhsTranspose.getSrc().getType() != rhsTranspose.getSrc().getType() ||
+      lhsTranspose.getResult().getType() != rhsTranspose.getResult().getType())
+    return false;
+
+  // Check both have the same offset and thus these operations can be overlapped.
+  return lhsTranspose->getAttr("allocation.offset") ==
+         rhsTranspose->getAttr("allocation.offset");
+}
+} // namespace
+bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
+  // For now, we only check these aren't layout conversions implemented as the
+  // same sub-group transposition.
+  assert(lhsOp && rhsOp && "Expecting valid operations");
+  return areSafeToOverlapSubGroupTransposeOps(lhsOp, rhsOp);
+}
+} // namespace mlir::intel
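
The filter is deliberately narrow for now. If more safe patterns are identified later, they compose naturally as a disjunction of predicates. A hypothetical extension sketch (areSafeToOverlapLocalLoads is a placeholder name, not part of this commit):

    bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
      assert(lhsOp && rhsOp && "Expecting valid operations");
      // Each predicate recognizes one class of operation pairs that can safely
      // execute back-to-back without a barrier; any match suffices.
      return areSafeToOverlapSubGroupTransposeOps(lhsOp, rhsOp) ||
             areSafeToOverlapLocalLoads(lhsOp, rhsOp); // hypothetical predicate
    }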

third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 5 additions & 3 deletions

@@ -15,6 +15,7 @@
 #include "intel/include/TritonIntelGPUToLLVM/Passes.h"
 
 #include "intel/include/Analysis/Allocation.h"
+#include "intel/include/Analysis/Membar.h"
 #include "triton/Analysis/AxisInfo.h"
 #include "triton/Analysis/Membar.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
@@ -77,7 +78,8 @@ struct ConvertTritonGPUToLLVM
     MLIRContext *context = &getContext();
     ModuleOp mod = getOperation();
 
-    intel::TritonGPUToLLVMPipelineManager pipelineManager(mod, context);
+    mlir::triton::intel::TritonGPUToLLVMPipelineManager pipelineManager(
+        mod, context);
     mlir::LowerToLLVMOptions option(context);
     bool isAdvancedPathEnabled =
         mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
@@ -97,7 +99,7 @@ struct ConvertTritonGPUToLLVM
     if (!pipelineManager.skipSharedMemoryAllocation()) {
       ModuleAllocation allocation =
           ModuleAllocation::get<triton::intel::AllocationAnalysis>(mod);
-      ModuleMembarAnalysis membarPass(&allocation);
+      ModuleMembarAnalysis membarPass(&allocation, ::mlir::intel::membarFilter);
       membarPass.run();
     }
@@ -116,7 +118,7 @@ struct ConvertTritonGPUToLLVM
       return signalPassFailure();
     }
 
-    intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod);
+    mlir::triton::intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod);
     OpBuilder::InsertPoint indexInsertPoint;
 
     RewritePatternSet patterns(context);
