Commit 057d82c
[XPU][Membar] Define basic Intel-specific Membar filter (#2640)
Define an Intel-specific `Membar` filter to reduce synchronization overhead in the Intel backend.

The `Membar` analysis inserts barriers to avoid race conditions between operations accessing the same memory. The granularity of this analysis is somewhat coarse, though, and unneeded barriers may be inserted. To avoid this, the analysis allows users to pass a callback function that filters out known-safe cases.

As one of the recurring cases of barriers being inserted when not needed, detect back-to-back layout conversions implemented as sub-group transposes as safe, so no barriers are inserted between them. See the code for further details on why this is safe.

---------

Signed-off-by: victor-eds <[email protected]>
1 parent 16f5738 commit 057d82c
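
For context, the filter is an ordinary predicate over operation pairs that the backend hands to the membar analysis. A minimal sketch of the mechanism, assuming the upstream analysis accepts a std::function-style callback (the alias name MembarFilterFn is an assumption; the membarFilter signature and the constructor call are taken from this commit's diff below):

    #include <functional>

    namespace mlir {
    class Operation;

    // Assumed callback shape: returning true means "no barrier is needed
    // between these two operations", so the analysis may skip inserting one.
    using MembarFilterFn = std::function<bool(Operation *, Operation *)>;

    namespace intel {
    // Declared in the new Analysis/Membar.h below.
    bool membarFilter(Operation *lhsOp, Operation *rhsOp);
    } // namespace intel
    } // namespace mlir

    // Hook-up, as done in TritonGPUToLLVM.cpp below:
    //   ModuleMembarAnalysis membarPass(&allocation, ::mlir::intel::membarFilter);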

File tree

5 files changed: +98 −3 lines changed

test/Conversion/intel/sub-group-transpose.mlir

Lines changed: 17 additions & 0 deletions

@@ -426,3 +426,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return %0 : tensor<32x64xf32, #blocked1>
   }
 }
+
+// -----
+
+// Test no barriers are inserted when back to back transpositions are performed.
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: llvm.func spir_kernelcc @test_back_to_back
+  // CHECK-NOT: barrier
+  tt.func @test_back_to_back(%arg0: tensor<32x64xf32, #blocked>, %arg1: tensor<32x64xf32, #blocked>) -> (tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>) {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    %1 = triton_gpu.convert_layout %arg1 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    tt.return %0, %1 : tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>
+  }
+}
third_party/intel/include/Analysis/Membar.h

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+#ifndef TRITON_INTEL_ANALYSIS_MEMBAR_H
+#define TRITON_INTEL_ANALYSIS_MEMBAR_H
+
+namespace mlir {
+class Operation;
+namespace intel {
+/// Intel-specific callback to filter operations that need no barriers between
+/// each other.
+///
+/// This is useful as the granularity to check whether barriers are needed is
+/// quite coarse. The filter will return true if no barrier is needed between
+/// `lhsOp` and `rhsOp`.
+bool membarFilter(Operation *lhsOp, Operation *rhsOp);
+} // namespace intel
+} // namespace mlir
+
+#endif // TRITON_INTEL_ANALYSIS_MEMBAR_H
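
As a usage note, the analysis treats this callback as a veto. A sketch of the assumed call-site shape inside the upstream membar analysis (this control flow is an illustration, not code from this commit):

    // Sketch: before separating two potentially conflicting operations with a
    // barrier, let the backend-specific filter rule the pair out as safe.
    if (filter && filter(lhsOp, rhsOp))
      return; // backend guarantees no synchronization is needed for this pair
    insertBarrier(builder, lhsOp);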

third_party/intel/lib/Analysis/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@ add_triton_library(TritonIntelAnalysis
   AxisInfo.cpp
   DPAS.cpp
   Liveness.cpp
+  Membar.cpp
   Utility.cpp
 
   DEPENDS
third_party/intel/lib/Analysis/Membar.cpp

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#include "intel/include/Analysis/Membar.h"
+
+#include "intel/include/Analysis/Utility.h"
+
+namespace mlir::intel {
+namespace {
+triton::gpu::ConvertLayoutOp dynCastToSubGroupTranspose(Operation *op) {
+  auto convertLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op);
+  if (!convertLayout)
+    return nullptr;
+
+  if (!triton::gpu::intel::cvtIsSubGroupTranspose(
+          convertLayout.getSrc().getType(),
+          convertLayout.getResult().getType()))
+    return nullptr;
+
+  return convertLayout;
+}
+
+/// Check whether `lhsOp` and `rhsOp` are sub-group transpose layout
+/// conversions that are safe to execute back-to-back.
+///
+/// Sub-group transposes are implemented as follows:
+///
+/// - Each sub-group writes all the elements it is handling to a memory block.
+/// - Each sub-group reads all the elements it is handling from the same memory
+///   region.
+///
+/// As there is no need to synchronize work-items in the same sub-group and we
+/// know data won't be shared between sub-groups, executing these operations
+/// back-to-back with no barriers in between is safe.
+bool areSafeToOverlapSubGroupTransposeOps(Operation *lhsOp, Operation *rhsOp) {
+  // Check both are lowered to sub-group transpose operations.
+  auto lhsTranspose = dynCastToSubGroupTranspose(lhsOp);
+  if (!lhsTranspose)
+    return false;
+  auto rhsTranspose = dynCastToSubGroupTranspose(rhsOp);
+  if (!rhsTranspose)
+    return false;
+
+  // Check the types of source and result are the same, i.e., we are expressing
+  // the same kind of transposition.
+  if (lhsTranspose.getSrc().getType() != rhsTranspose.getSrc().getType() ||
+      lhsTranspose.getResult().getType() != rhsTranspose.getResult().getType())
+    return false;
+
+  // Check both have the same offset and thus these operations can be overlapped.
+  return lhsTranspose->getAttr("allocation.offset") ==
+         rhsTranspose->getAttr("allocation.offset");
+}
+} // namespace
+bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
+  // For now, we only check these aren't layout conversions implemented as the
+  // same sub-group transposition.
+  assert(lhsOp && rhsOp && "Expecting valid operations");
+  return areSafeToOverlapSubGroupTransposeOps(lhsOp, rhsOp);
+}
+} // namespace mlir::intel
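
The filter is deliberately narrow for now. If more safe patterns are identified later, they compose naturally as a disjunction of predicates. A hypothetical extension sketch (areSafeToOverlapLocalLoads is a placeholder name, not part of this commit):

    bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
      assert(lhsOp && rhsOp && "Expecting valid operations");
      // Each predicate recognizes one class of operation pairs that can safely
      // execute back-to-back without a barrier; any match suffices.
      return areSafeToOverlapSubGroupTransposeOps(lhsOp, rhsOp) ||
             areSafeToOverlapLocalLoads(lhsOp, rhsOp); // hypothetical predicate
    }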

third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 5 additions & 3 deletions

@@ -15,6 +15,7 @@
 #include "intel/include/TritonIntelGPUToLLVM/Passes.h"
 
 #include "intel/include/Analysis/Allocation.h"
+#include "intel/include/Analysis/Membar.h"
 #include "triton/Analysis/AxisInfo.h"
 #include "triton/Analysis/Membar.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
@@ -77,7 +78,8 @@ struct ConvertTritonGPUToLLVM
     MLIRContext *context = &getContext();
     ModuleOp mod = getOperation();
 
-    intel::TritonGPUToLLVMPipelineManager pipelineManager(mod, context);
+    mlir::triton::intel::TritonGPUToLLVMPipelineManager pipelineManager(
+        mod, context);
     mlir::LowerToLLVMOptions option(context);
     bool isAdvancedPathEnabled =
         mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
@@ -97,7 +99,7 @@ struct ConvertTritonGPUToLLVM
     if (!pipelineManager.skipSharedMemoryAllocation()) {
       ModuleAllocation allocation =
           ModuleAllocation::get<triton::intel::AllocationAnalysis>(mod);
-      ModuleMembarAnalysis membarPass(&allocation);
+      ModuleMembarAnalysis membarPass(&allocation, ::mlir::intel::membarFilter);
       membarPass.run();
     }
@@ -116,7 +118,7 @@ struct ConvertTritonGPUToLLVM
       return signalPassFailure();
     }
 
-    intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod);
+    mlir::triton::intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod);
     OpBuilder::InsertPoint indexInsertPoint;
 
     RewritePatternSet patterns(context);
