Skip to content

Commit 2b612ce

Browse files
Merge OpenAI Triton commit 09d5113 (#4586)
This PR changes the Triton base from 6c3d943 to 09d5113 (Jun 23). Pass rate: 97.12%
2 parents 63aeb2b + 35023b0 commit 2b612ce

File tree

43 files changed

+1292
-255
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1292
-255
lines changed

include/triton/Analysis/Membar.h

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,9 @@ struct BlockInfo {
9595
//===----------------------------------------------------------------------===//
9696
// Shared Memory Barrier Analysis
9797
//===----------------------------------------------------------------------===//
98-
class MembarAnalysis {
98+
99+
// Common class to analyze membar and fence placement.
100+
class MembarOrFenceAnalysis {
99101
using VirtualBlock = std::pair<Block *, Block::iterator>;
100102

101103
public:
@@ -113,15 +115,15 @@ class MembarAnalysis {
113115
/// a shared memory read. If the temporary storage is written but not read,
114116
/// it is considered as the problem of the operation itself but not the membar
115117
/// analysis.
116-
MembarAnalysis() = default;
117-
explicit MembarAnalysis(Allocation *allocation, MembarFilterFn filter)
118+
MembarOrFenceAnalysis() = default;
119+
explicit MembarOrFenceAnalysis(Allocation *allocation, MembarFilterFn filter)
118120
: allocation(allocation), filter(filter) {}
119121

120122
/// Runs the membar analysis to the given operation, inserts a barrier if
121123
/// necessary.
122124
void run(FuncBlockInfoMapT &funcBlockInfoMap);
123125

124-
private:
126+
protected:
125127
/// Applies the barrier analysis based on the SCF dialect, in which each
126128
/// region has a single basic block only.
127129
/// Example:
@@ -139,30 +141,44 @@ class MembarAnalysis {
139141
void resolve(FunctionOpInterface funcOp, FuncBlockInfoMapT *funcBlockInfoMap,
140142
OpBuilder *builder);
141143

142-
/// Updates the BlockInfo operation based on the operation.
143-
void update(Operation *operation, BlockInfo *blockInfo,
144-
FuncBlockInfoMapT *funcBlockInfoMap, OpBuilder *builder);
145-
146144
/// Collects the successors of the terminator
147145
void visitTerminator(Operation *operation,
148146
SmallVector<VirtualBlock> &successors);
149147

150-
void insertBarrier(Operation *operation, OpBuilder *builder);
148+
/// Updates the BlockInfo operation based on the operation.
149+
virtual void update(Operation *operation, BlockInfo *blockInfo,
150+
FuncBlockInfoMapT *funcBlockInfoMap,
151+
OpBuilder *builder) = 0;
151152

152-
private:
153153
Allocation *allocation = nullptr;
154154
MembarFilterFn filter = nullptr;
155155
};
156156

157+
class MembarAnalysis : public MembarOrFenceAnalysis {
158+
public:
159+
MembarAnalysis() = default;
160+
explicit MembarAnalysis(Allocation *allocation, MembarFilterFn filter)
161+
: MembarOrFenceAnalysis(allocation, filter) {}
162+
163+
private:
164+
/// Updates the BlockInfo operation based on the operation.
165+
virtual void update(Operation *operation, BlockInfo *blockInfo,
166+
FuncBlockInfoMapT *funcBlockInfoMap,
167+
OpBuilder *builder) override;
168+
169+
void insertBarrier(Operation *operation, OpBuilder *builder);
170+
};
171+
157172
/// Postorder traversal on the callgraph to insert membar instructions
158173
/// of each function.
159174
/// Each function maintains a BlockInfo map that includes all potential buffers
160175
/// after returning. This way users do not have to explicitly insert membars
161176
/// before and after function calls, but might be a bit conservative.
162-
class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
177+
template <typename AnalysisType>
178+
class ModuleMembarOrFenceAnalysis : public CallGraph<BlockInfo> {
163179
public:
164-
ModuleMembarAnalysis(ModuleAllocation *moduleAllocation,
165-
MembarFilterFn filter = nullptr)
180+
ModuleMembarOrFenceAnalysis(ModuleAllocation *moduleAllocation,
181+
MembarFilterFn filter = nullptr)
166182
: CallGraph<BlockInfo>(moduleAllocation->getModuleOp()),
167183
moduleAllocation(moduleAllocation), filter(filter) {}
168184

@@ -175,7 +191,7 @@ class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
175191
auto *allocation = moduleAllocation->getFuncData(funcOp);
176192
auto [it, inserted] = funcMap.try_emplace(funcOp, BlockInfo());
177193
if (inserted) {
178-
MembarAnalysis analysis(allocation, filter);
194+
AnalysisType analysis(allocation, filter);
179195
analysis.run(funcMap);
180196
}
181197
});
@@ -186,6 +202,8 @@ class ModuleMembarAnalysis : public CallGraph<BlockInfo> {
186202
MembarFilterFn filter;
187203
};
188204

205+
typedef ModuleMembarOrFenceAnalysis<MembarAnalysis> ModuleMembarAnalysis;
206+
189207
} // namespace mlir
190208

191209
#endif // TRITON_ANALYSIS_MEMBAR_H

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,10 @@ void replaceUsesWithLocalLoad(
272272
// after converting loads into async loads.
273273
bool comesFromLoadOrBlockArg(Value v);
274274

275+
// For structured control flow ops, returns the values associated with the
276+
// `resultIdx`th result.
277+
SmallVector<Value> getTiedArgs(Operation *op, int resultIdx);
278+
275279
} // namespace mlir::triton
276280

277281
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,28 @@ def TritonGPUPlanCTAPass : Pass<"triton-nvidia-gpu-plan-cta", "mlir::ModuleOp">
4141
}
4242

4343
def TritonGPUFenceInsertion : Pass<"triton-nvidia-gpu-fence-insertion", "mlir::ModuleOp"> {
44+
let summary = "Insert fences across generic and async proxy.";
45+
46+
let description = [{
47+
This pass is to insert memory fences to ensure that memory operations are
48+
properly ordered across generic and async operations.
49+
This pass inserts fences at optimized locations.
50+
There is a pass later to handle all the functional requirements.
51+
}];
52+
53+
let dependentDialects = [
54+
"mlir::triton::gpu::TritonGPUDialect",
55+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
56+
];
57+
58+
let options = [
59+
Option<"computeCapability", "compute-capability",
60+
"int32_t", /*default*/"90",
61+
"device compute capability">
62+
];
63+
}
64+
65+
def TritonGPUProxyFenceInsertion : Pass<"triton-nvidia-gpu-proxy-fence-insertion", "mlir::ModuleOp"> {
4466
let summary = "Insert fences across generic and async proxy";
4567

4668
let description = [{

lib/Analysis/Membar.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88

99
namespace mlir {
1010

11-
void MembarAnalysis::run(FuncBlockInfoMapT &funcBlockInfoMap) {
11+
void MembarOrFenceAnalysis::run(FuncBlockInfoMapT &funcBlockInfoMap) {
1212
FunctionOpInterface funcOp =
1313
dyn_cast<FunctionOpInterface>(allocation->getOperation());
1414
OpBuilder builder(funcOp.getContext());
1515
resolve(funcOp, &funcBlockInfoMap, &builder);
1616
}
1717

18-
void MembarAnalysis::resolve(FunctionOpInterface funcOp,
19-
FuncBlockInfoMapT *funcBlockInfoMap,
20-
OpBuilder *builder) {
18+
void MembarOrFenceAnalysis::resolve(FunctionOpInterface funcOp,
19+
FuncBlockInfoMapT *funcBlockInfoMap,
20+
OpBuilder *builder) {
2121
// Initialize the blockList. Operations are organized into "virtual blocks",
2222
// which represent segments of straight-line code analyzed by each iteration
2323
// of the dataflow analysis. Virtual blocks abstract over both control flow
@@ -103,8 +103,8 @@ void MembarAnalysis::resolve(FunctionOpInterface funcOp,
103103
});
104104
}
105105

106-
void MembarAnalysis::visitTerminator(Operation *op,
107-
SmallVector<VirtualBlock> &successors) {
106+
void MembarOrFenceAnalysis::visitTerminator(
107+
Operation *op, SmallVector<VirtualBlock> &successors) {
108108
if (isa<BranchOpInterface>(op)) {
109109
// Collect the block successors of the branch.
110110
for (Block *successor : op->getSuccessors())

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,13 @@ namespace gpu {
3333
#define GEN_PASS_DEF_TRITONGPUPIPELINE
3434
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
3535

36-
static void pipelineWgmma(ModuleOp moduleOp) {
36+
static void pipelineWgmma(ModuleOp moduleOp, unsigned numStages) {
3737
SmallVector<scf::ForOp> loops;
3838
moduleOp->walk([&](scf::ForOp forOp) { loops.push_back(forOp); });
3939

4040
for (scf::ForOp forOp : loops) {
41-
mlir::triton::asyncLaunchDots(forOp);
41+
if (getNumStagesOrDefault(forOp, numStages) >= 1)
42+
mlir::triton::asyncLaunchDots(forOp);
4243
}
4344
}
4445

@@ -223,7 +224,6 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
223224

224225
void runOnOperation() override {
225226
ModuleOp moduleOp = getOperation();
226-
227227
// Transform the loop by introducing async operations to prepare it for
228228
// pipeline expansion.
229229
lowerLoops(moduleOp);
@@ -244,7 +244,7 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
244244
// Cleanup the IR from the pipeline attributes.
245245
removeAttributes(moduleOp);
246246

247-
pipelineWgmma(moduleOp);
247+
pipelineWgmma(moduleOp, numStages);
248248

249249
// schedule the waits
250250
mlir::triton::updateWaits(getOperation());

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1193,7 +1193,7 @@ void LayoutRematerialization::backwardRematerialization(
11931193
} else if (isa<arith::ConstantOp>(op)) {
11941194
// special-case: arith.constant has zero cost
11951195
continue;
1196-
} else if (isa<LoadOp>(op)) {
1196+
} else if (isa<LoadOp>(op) || isa<LocalLoadOp>(op)) {
11971197
// optimistically assume L1-cached:
11981198
for (Value result : op->getResults()) {
11991199
rematerialisationCost += 8 * getByteCount(result);
@@ -1208,6 +1208,12 @@ void LayoutRematerialization::backwardRematerialization(
12081208
for (Value result : op->getResults()) {
12091209
rematerialisationCost += multiplier * getByteCount(result);
12101210
}
1211+
} else if (isa<ReduceOp>(op)) {
1212+
// Reduce ops introduce significant rematerialization cost.
1213+
auto reduceOp = dyn_cast<ReduceOp>(op);
1214+
ReduceOpHelper helper(reduceOp);
1215+
rematerialisationCost += helper.getIntraWarpSizeWithUniqueData();
1216+
rematerialisationCost += 8 * helper.getInterWarpSizeWithUniqueData();
12111217
}
12121218
}
12131219

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,4 +1586,36 @@ bool comesFromLoadOrBlockArg(Value v) {
15861586
isa<LoadOp, DescriptorLoadOp, DescriptorGatherOp>(v.getDefiningOp()));
15871587
}
15881588

1589+
SmallVector<Value> getTiedArgs(Operation *op, int resultIdx) {
1590+
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
1591+
auto iterArg = forOp.getRegionIterArg(resultIdx);
1592+
auto result = forOp.getResult(resultIdx);
1593+
auto yieldVal = forOp.getBody()->getTerminator()->getOperand(resultIdx);
1594+
auto initVal = forOp.getInitArgs()[resultIdx];
1595+
return {iterArg, result, yieldVal, initVal};
1596+
} else if (auto whileOp = dyn_cast<scf::WhileOp>(op)) {
1597+
auto iterArg = whileOp.getBeforeArguments()[resultIdx];
1598+
auto result = whileOp.getResults()[resultIdx];
1599+
auto yieldVal =
1600+
whileOp.getBeforeBody()->getTerminator()->getOperand(resultIdx);
1601+
auto initVal = whileOp.getOperands()[resultIdx];
1602+
return {iterArg, result, iterArg, initVal};
1603+
} else if (auto ifOp = dyn_cast<scf::IfOp>(op)) {
1604+
SmallVector<Value> values;
1605+
for (auto &block : ifOp.getThenRegion().getBlocks()) {
1606+
auto terminator = block.getTerminator();
1607+
if (isa<scf::YieldOp>(terminator))
1608+
values.push_back(terminator->getOperands()[resultIdx]);
1609+
}
1610+
for (auto &block : ifOp.getElseRegion().getBlocks()) {
1611+
auto terminator = block.getTerminator();
1612+
if (isa<scf::YieldOp>(terminator))
1613+
values.push_back(terminator->getOperands()[resultIdx]);
1614+
}
1615+
values.push_back(ifOp->getResults()[resultIdx]);
1616+
return values;
1617+
}
1618+
return {};
1619+
}
1620+
15891621
} // namespace mlir::triton

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ add_triton_library(TritonNvidiaGPUTransforms
66
OptimizeTMemLayouts.cpp
77
PlanCTA.cpp
88
PromoteLHSToTMem.cpp
9+
ProxFenceInsertion.cpp
910
RemoveTMEMTokens.cpp
1011
TensorMemoryAllocation.cpp
1112
TMALowering.cpp

lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeDescriptorEncoding.cpp

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -128,38 +128,6 @@ namespace nvidia_gpu {
128128

129129
namespace {
130130

131-
SmallVector<Value> getTiedArgs(Operation *op, int resultIdx) {
132-
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
133-
auto iterArg = forOp.getRegionIterArg(resultIdx);
134-
auto result = forOp.getResult(resultIdx);
135-
auto yieldVal = forOp.getBody()->getTerminator()->getOperand(resultIdx);
136-
auto initVal = forOp.getInitArgs()[resultIdx];
137-
return {iterArg, result, yieldVal, initVal};
138-
} else if (auto whileOp = dyn_cast<scf::WhileOp>(op)) {
139-
auto iterArg = whileOp.getBeforeArguments()[resultIdx];
140-
auto result = whileOp.getResults()[resultIdx];
141-
auto yieldVal =
142-
whileOp.getBeforeBody()->getTerminator()->getOperand(resultIdx);
143-
auto initVal = whileOp.getOperands()[resultIdx];
144-
return {iterArg, result, iterArg, initVal};
145-
} else if (auto ifOp = dyn_cast<scf::IfOp>(op)) {
146-
SmallVector<Value> values;
147-
for (auto &block : ifOp.getThenRegion().getBlocks()) {
148-
auto terminator = block.getTerminator();
149-
if (isa<scf::YieldOp>(terminator))
150-
values.push_back(terminator->getOperands()[resultIdx]);
151-
}
152-
for (auto &block : ifOp.getElseRegion().getBlocks()) {
153-
auto terminator = block.getTerminator();
154-
if (isa<scf::YieldOp>(terminator))
155-
values.push_back(terminator->getOperands()[resultIdx]);
156-
}
157-
values.push_back(ifOp->getResults()[resultIdx]);
158-
return values;
159-
}
160-
return {};
161-
}
162-
163131
const EncodingInfo *internEncoding(std::unordered_set<EncodingInfo> &encodings,
164132
EncodingInfo info) {
165133
return &*encodings.insert(info).first;

0 commit comments

Comments
 (0)