Skip to content

Commit d4432f6

Browse files
authored
[BACKEND] Add a new pass to insert fence.proxy.async for write after read hazard (#7262)
When loading data from smem using the generic proxy then writing using the async proxy we need to insert a fence before writing. This case was being missed in our current analysis. This commit adds a new pass to conservatively insert fences in this kind of scenario. We still need the existing insertFence pass as it can be smarter about fence insertion because it runs before lowering of the structured control flow. This pass is meant to be more conservative while the previous pass can optimize better. Didn't see significant perf regressions so far.
1 parent bf5913d commit d4432f6

File tree

8 files changed

+322
-23
lines changed

8 files changed

+322
-23
lines changed

include/triton/Analysis/Membar.h

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,9 @@ struct BlockInfo {
9595
//===----------------------------------------------------------------------===//
9696
// Shared Memory Barrier Analysis
9797
//===----------------------------------------------------------------------===//
98-
class MembarAnalysis {
98+
99+
// Common class to analyze membar and fence placement.
100+
class MembarOrFenceAnalysis {
99101
using VirtualBlock = std::pair<Block *, Block::iterator>;
100102

101103
public:
@@ -113,15 +115,15 @@ class MembarAnalysis {
113115
/// a shared memory read. If the temporary storage is written but not read,
114116
/// it is considered as the problem of the operation itself but not the membar
115117
/// analysis.
116-
MembarAnalysis() = default;
117-
explicit MembarAnalysis(Allocation *allocation, MembarFilterFn filter)
118+
MembarOrFenceAnalysis() = default;
119+
explicit MembarOrFenceAnalysis(Allocation *allocation, MembarFilterFn filter)
118120
: allocation(allocation), filter(filter) {}
119121

120122
/// Runs the membar analysis to the given operation, inserts a barrier if
121123
/// necessary.
122124
void run(FuncBlockInfoMapT &funcBlockInfoMap);
123125

124-
private:
126+
protected:
125127
/// Applies the barrier analysis based on the SCF dialect, in which each
126128
/// region has a single basic block only.
127129
/// Example:
@@ -139,30 +141,44 @@ class MembarAnalysis {
139141
void resolve(FunctionOpInterface funcOp, FuncBlockInfoMapT *funcBlockInfoMap,
140142
OpBuilder *builder);
141143

142-
/// Updates the BlockInfo operation based on the operation.
143-
void update(Operation *operation, BlockInfo *blockInfo,
144-
FuncBlockInfoMapT *funcBlockInfoMap, OpBuilder *builder);
145-
146144
/// Collects the successors of the terminator
147145
void visitTerminator(Operation *operation,
148146
SmallVector<VirtualBlock> &successors);
149147

150-
void insertBarrier(Operation *operation, OpBuilder *builder);
148+
/// Updates the BlockInfo operation based on the operation.
149+
virtual void update(Operation *operation, BlockInfo *blockInfo,
150+
FuncBlockInfoMapT *funcBlockInfoMap,
151+
OpBuilder *builder) = 0;
151152

152-
private:
153153
Allocation *allocation = nullptr;
154154
MembarFilterFn filter = nullptr;
155155
};
156156

157+
class MembarAnalysis : public MembarOrFenceAnalysis {
158+
public:
159+
MembarAnalysis() = default;
160+
explicit MembarAnalysis(Allocation *allocation, MembarFilterFn filter)
161+
: MembarOrFenceAnalysis(allocation, filter) {}
162+
163+
private:
164+
/// Updates the BlockInfo operation based on the operation.
165+
virtual void update(Operation *operation, BlockInfo *blockInfo,
166+
FuncBlockInfoMapT *funcBlockInfoMap,
167+
OpBuilder *builder) override;
168+
169+
void insertBarrier(Operation *operation, OpBuilder *builder);
170+
};
171+
157172
/// Postorder traversal on the callgraph to insert membar instructions
158173
/// of each function.
159174
/// Each function maintains a BlockInfo map that includes all potential buffers
160175
/// after returning. This way users do not have to explicitly insert membars
161176
/// before and after function calls, but might be a bit conservative.
162-
class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
177+
template <typename AnalysisType>
178+
class ModuleMembarOrFenceAnalysis : public CallGraph<BlockInfo> {
163179
public:
164-
ModuleMembarAnalysis(ModuleAllocation *moduleAllocation,
165-
MembarFilterFn filter = nullptr)
180+
ModuleMembarOrFenceAnalysis(ModuleAllocation *moduleAllocation,
181+
MembarFilterFn filter = nullptr)
166182
: CallGraph<BlockInfo>(moduleAllocation->getModuleOp()),
167183
moduleAllocation(moduleAllocation), filter(filter) {}
168184

@@ -175,7 +191,7 @@ class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
175191
auto *allocation = moduleAllocation->getFuncData(funcOp);
176192
auto [it, inserted] = funcMap.try_emplace(funcOp, BlockInfo());
177193
if (inserted) {
178-
MembarAnalysis analysis(allocation, filter);
194+
AnalysisType analysis(allocation, filter);
179195
analysis.run(funcMap);
180196
}
181197
});
@@ -186,6 +202,8 @@ class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
186202
MembarFilterFn filter;
187203
};
188204

205+
typedef ModuleMembarOrFenceAnalysis<MembarAnalysis> ModuleMembarAnalysis;
206+
189207
} // namespace mlir
190208

191209
#endif // TRITON_ANALYSIS_MEMBAR_H

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,28 @@ def TritonGPUPlanCTAPass : Pass<"triton-nvidia-gpu-plan-cta", "mlir::ModuleOp">
4141
}
4242

4343
def TritonGPUFenceInsertion : Pass<"triton-nvidia-gpu-fence-insertion", "mlir::ModuleOp"> {
44+
let summary = "Insert fences across generic and async proxy.";
45+
46+
let description = [{
47+
This pass is to insert memory fences to ensure that memory operations are
48+
properly ordered across generic and async operations.
49+
This pass inserts fences at optimized locations.
50+
A later pass handles all the remaining functional requirements.
51+
}];
52+
53+
let dependentDialects = [
54+
"mlir::triton::gpu::TritonGPUDialect",
55+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
56+
];
57+
58+
let options = [
59+
Option<"computeCapability", "compute-capability",
60+
"int32_t", /*default*/"90",
61+
"device compute capability">
62+
];
63+
}
64+
65+
def TritonGPUProxyFenceInsertion : Pass<"triton-nvidia-gpu-proxy-fence-insertion", "mlir::ModuleOp"> {
4466
let summary = "Insert fences across generic and async proxy";
4567

4668
let description = [{

lib/Analysis/Membar.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88

99
namespace mlir {
1010

11-
void MembarAnalysis::run(FuncBlockInfoMapT &funcBlockInfoMap) {
11+
void MembarOrFenceAnalysis::run(FuncBlockInfoMapT &funcBlockInfoMap) {
1212
FunctionOpInterface funcOp =
1313
dyn_cast<FunctionOpInterface>(allocation->getOperation());
1414
OpBuilder builder(funcOp.getContext());
1515
resolve(funcOp, &funcBlockInfoMap, &builder);
1616
}
1717

18-
void MembarAnalysis::resolve(FunctionOpInterface funcOp,
19-
FuncBlockInfoMapT *funcBlockInfoMap,
20-
OpBuilder *builder) {
18+
void MembarOrFenceAnalysis::resolve(FunctionOpInterface funcOp,
19+
FuncBlockInfoMapT *funcBlockInfoMap,
20+
OpBuilder *builder) {
2121
// Initialize the blockList. Operations are organized into "virtual blocks",
2222
// which represent segments of straight-line code analyzed by each iteration
2323
// of the dataflow analysis. Virtual blocks abstract over both control flow
@@ -103,8 +103,8 @@ void MembarAnalysis::resolve(FunctionOpInterface funcOp,
103103
});
104104
}
105105

106-
void MembarAnalysis::visitTerminator(Operation *op,
107-
SmallVector<VirtualBlock> &successors) {
106+
void MembarOrFenceAnalysis::visitTerminator(
107+
Operation *op, SmallVector<VirtualBlock> &successors) {
108108
if (isa<BranchOpInterface>(op)) {
109109
// Collect the block successors of the branch.
110110
for (Block *successor : op->getSuccessors())

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ add_triton_library(TritonNvidiaGPUTransforms
66
OptimizeTMemLayouts.cpp
77
PlanCTA.cpp
88
PromoteLHSToTMem.cpp
9+
ProxFenceInsertion.cpp
910
RemoveTMEMTokens.cpp
1011
TensorMemoryAllocation.cpp
1112
TMALowering.cpp
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#include "triton/Analysis/Allocation.h"
2+
#include "triton/Analysis/Membar.h"
3+
#include "triton/Analysis/Utility.h"
4+
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
5+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
6+
#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
7+
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// On Hopper+, async proxy is separate from generic proxy, so when shared memory
11+
// is accessed through the generic proxy and then through the async proxy, we
// need to insert a fence to ensure
12+
// memory consistency.
13+
// This pass analyzes dependencies and will conservatively insert fences to
14+
// avoid race conditions between proxies. Async proxy is defined here:
15+
// https://docs.nvidia.com/cuda/parallel-thread-execution/#async-proxy
16+
//
17+
// This pass runs after shared memory allocation, to make sure we insert fences
18+
// between ops accessing aliasing buffers if needed.
19+
//
20+
// We also run a fence insertion pass during optimization phase as it is easier
21+
// to insert fences at optimal locations based on structured control flow.
22+
//
23+
//===----------------------------------------------------------------------===//
24+
25+
namespace mlir {
26+
namespace triton {
27+
namespace nvidia_gpu {
28+
29+
#define GEN_PASS_DEF_TRITONGPUPROXYFENCEINSERTION
30+
#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"
31+
32+
namespace {
33+
34+
bool isAsyncProxyWrite(Operation *op) {
35+
return isa<triton::nvidia_gpu::AsyncTMACopyGlobalToLocalOp,
36+
triton::nvidia_gpu::AsyncTMAGatherOp>(op);
37+
}
38+
39+
Value getSmemDest(Operation *op) {
40+
if (auto asyncTMACopyGlobalToLocalOp =
41+
dyn_cast<triton::nvidia_gpu::AsyncTMACopyGlobalToLocalOp>(op)) {
42+
return asyncTMACopyGlobalToLocalOp.getResult();
43+
}
44+
if (auto asyncTMAGatherOp =
45+
dyn_cast<triton::nvidia_gpu::AsyncTMAGatherOp>(op)) {
46+
return asyncTMAGatherOp.getResult();
47+
}
48+
return Value();
49+
}
50+
51+
bool isAsyncProxyRead(Operation *op) {
52+
return isa<triton::nvidia_gpu::WarpGroupDotOp,
53+
triton::nvidia_gpu::TCGen5MMAOp,
54+
triton::nvidia_gpu::TCGen5MMAScaledOp,
55+
triton::nvidia_gpu::AsyncTMACopyGlobalToLocalOp,
56+
triton::nvidia_gpu::AsyncTMAScatterOp,
57+
triton::nvidia_gpu::AsyncTMAReduceOp>(op);
58+
}
59+
60+
bool ignoreOpForProxyFence(Operation *op) {
61+
return isAsyncProxyRead(op) || isAsyncProxyWrite(op) ||
62+
isa<triton::nvidia_gpu::ArriveBarrierOp,
63+
triton::nvidia_gpu::TMEMCopyOp, triton::nvidia_gpu::WaitBarrierOp,
64+
triton::nvidia_gpu::InitBarrierOp,
65+
triton::nvidia_gpu::InvalBarrierOp>(op);
66+
}
67+
68+
bool filterFn(Operation *op, Operation *other) {
69+
return ignoreOpForProxyFence(other);
70+
}
71+
72+
//===----------------------------------------------------------------------===//
73+
// Proxy Fence Analysis
74+
//===----------------------------------------------------------------------===//
75+
class ProxyFenceAnalysis : public MembarOrFenceAnalysis {
76+
77+
public:
78+
ProxyFenceAnalysis() = default;
79+
explicit ProxyFenceAnalysis(Allocation *allocation, MembarFilterFn filter)
80+
: MembarOrFenceAnalysis(allocation, filter) {}
81+
82+
private:
83+
/// Updates the BlockInfo operation based on the operation.
84+
virtual void update(Operation *operation, BlockInfo *blockInfo,
85+
FuncBlockInfoMapT *funcBlockInfoMap,
86+
OpBuilder *builder) override;
87+
88+
void insertFence(Operation *operation, OpBuilder *builder);
89+
};
90+
91+
void ProxyFenceAnalysis::insertFence(Operation *op, OpBuilder *builder) {
92+
OpBuilder::InsertionGuard g(*builder);
93+
builder->create<triton::nvidia_gpu::FenceAsyncSharedOp>(op->getLoc(), false);
94+
}
95+
96+
void ProxyFenceAnalysis::update(Operation *op, BlockInfo *blockInfo,
97+
FuncBlockInfoMapT *funcBlockInfoMap,
98+
OpBuilder *builder) {
99+
if (isa<triton::nvidia_gpu::FenceAsyncSharedOp>(op)) {
100+
// If the current op is a fence, we clear previous reads and writes
101+
blockInfo->sync();
102+
return;
103+
}
104+
BlockInfo curBlockInfo;
105+
BlockInfo proxyBlockInfo;
106+
107+
auto scratchBufferId = Allocation::InvalidBufferId;
108+
if (isa<triton::CallOp>(op)) {
109+
// Inter-function dependencies
110+
auto callOpInterface = dyn_cast<CallOpInterface>(op);
111+
if (auto callee =
112+
dyn_cast<FunctionOpInterface>(callOpInterface.resolveCallable()))
113+
curBlockInfo = funcBlockInfoMap->lookup(callee);
114+
} else {
115+
// Intra-function dependencies
116+
if (auto memoryEffectOpInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
117+
// Explicit buffer
118+
SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>>
119+
effectInstances;
120+
memoryEffectOpInterface.getEffects(effectInstances);
121+
for (auto effectInstance : effectInstances) {
122+
if (auto value = effectInstance.getValue()) {
123+
for (auto bufferId : allocation->getBufferIds(value)) {
124+
if (bufferId != Allocation::InvalidBufferId) {
125+
// TODO: handle proxy read cases. Those are currently handled in
126+
// FenceInsertionPass where it can generate better placement for
127+
// the fence. But we should support a safe fallback here.
128+
if (isAsyncProxyWrite(op)) {
129+
if (value == getSmemDest(op)) {
130+
proxyBlockInfo
131+
.syncWriteIntervals[allocation->getAllocatedInterval(
132+
bufferId)]
133+
.insert(op);
134+
}
135+
} else if (isa<MemoryEffects::Write>(
136+
effectInstance.getEffect())) {
137+
curBlockInfo
138+
.syncWriteIntervals[allocation->getAllocatedInterval(
139+
bufferId)]
140+
.insert(op);
141+
} else if (isa<MemoryEffects::Read>(effectInstance.getEffect())) {
142+
curBlockInfo
143+
.syncReadIntervals[allocation->getAllocatedInterval(
144+
bufferId)]
145+
.insert(op);
146+
}
147+
}
148+
}
149+
}
150+
}
151+
}
152+
scratchBufferId = allocation->getBufferId(op);
153+
}
154+
155+
// Scratch buffer operations consist of a series of shared memory operations
156+
// starting from a shared memory write, followed by a series of shared memory
157+
// read/write operations, mark them as a read.
158+
if (scratchBufferId != Allocation::InvalidBufferId) {
159+
auto interval = allocation->getAllocatedInterval(scratchBufferId);
160+
curBlockInfo.syncReadIntervals[interval].insert(op);
161+
}
162+
if (isAsyncProxyWrite(op) || isAsyncProxyRead(op)) {
163+
if (proxyBlockInfo.isIntersected(*blockInfo, filter)) {
164+
builder->setInsertionPoint(op);
165+
insertFence(op, builder);
166+
blockInfo->sync();
167+
}
168+
}
169+
170+
// Update the region info, even if barrier is inserted, we have to maintain
171+
// the current op's read/write buffers.
172+
blockInfo->join(curBlockInfo);
173+
}
174+
} // namespace
175+
176+
struct ProxyFenceInsertionPass
177+
: public impl::TritonGPUProxyFenceInsertionBase<ProxyFenceInsertionPass> {
178+
179+
public:
180+
using impl::TritonGPUProxyFenceInsertionBase<
181+
ProxyFenceInsertionPass>::TritonGPUProxyFenceInsertionBase;
182+
void runOnOperation() override {
183+
// Only insert fences for compute capability 9.0 and above
184+
if (computeCapability < 90)
185+
return;
186+
ModuleOp mod = getOperation();
187+
ModuleAllocation allocation(mod);
188+
ModuleMembarOrFenceAnalysis<ProxyFenceAnalysis> analysis(&allocation,
189+
filterFn);
190+
analysis.run();
191+
}
192+
};
193+
194+
} // namespace nvidia_gpu
195+
} // namespace triton
196+
} // namespace mlir

0 commit comments

Comments
 (0)