Skip to content

Commit d250c37

Browse files
manman-ren and meta-codesync[bot]
authored and committed
[autoWS] port support from ws-3.5 (#533)
Summary: including up to [autoWS] Reorder epilog ops to favor cooperative warp scheduling. (#461) also picking [autoWS] fix lit failures (#622) Fixed TritonGPU/loop-schedule.mlir (#631) [autoWS] Generalize passes to handle causal (#419) [autoWS] fix pytest failures (#639) Track lit failures in T243551722 T243551750 Guard OptimizePartitionWarps with tlx. With Meta's autoWS we run the pass outside of autoWS. Pull Request resolved: #533 Test Plan: pytest python/tutorials/fused-attention-ws.py -rs pytest python/tutorials/fused-attention-ws-device-tma.py -rs Reviewed By: minjang Differential Revision: D85833462 Pulled By: manman-ren fbshipit-source-id: 1149aa94e5470536536a522927414462dd950e87
1 parent bad736e commit d250c37

File tree

63 files changed

+9923
-812
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+9923
-812
lines changed

include/triton/Analysis/Allocation.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ namespace mlir {
1313

1414
namespace triton {
1515
class AllocationAnalysis;
16+
class MemoryPlanner;
17+
class MemoryPlannerTmem;
1618

1719
/// Callback to allow backends to specify target-specific scratch sizes for
1820
/// some operations.
@@ -154,6 +156,15 @@ class Allocation {
154156
size_t alignment;
155157
size_t offset;
156158

159+
// For MemoryPlannerTmem
160+
bool isOwnerOfSpace;
161+
size_t rowOffset;
162+
size_t colOffset;
163+
size_t rowSize;
164+
size_t colSize;
165+
size_t reuseOffset; // when isOwnerOfSpace is true
166+
BufferT *reuseOwner; // when isOwnerOfSpace is false
167+
157168
bool operator==(const BufferT &other) const { return id == other.id; }
158169
bool operator<(const BufferT &other) const { return id < other.id; }
159170

@@ -208,6 +219,8 @@ class Allocation {
208219
size_t bufferIdCounter = 0;
209220

210221
friend class triton::AllocationAnalysis;
222+
friend class triton::MemoryPlanner;
223+
friend class triton::MemoryPlannerTmem;
211224
};
212225

213226
/// Static analysis that computes the allocation of shared memory buffers

include/triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ namespace mlir::triton::gpu {
1212
void attachAllocationSizeAndOffsetAttr(ModuleOp mod,
1313
ModuleAllocation &allocation);
1414

15+
/// Add shared memory access annotations to all operations that use shared
16+
/// memory. Only adds annotations when MLIR_ENABLE_DUMP=1 is set.
17+
void addSharedMemoryAnnotations(ModuleOp mod);
18+
1519
} // namespace mlir::triton::gpu
1620

1721
#endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
4545
constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
4646
constexpr static char AttrTargetName[] = "ttg.target";
4747
constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
48+
constexpr static char AttrMinRegAutoWSName[] = "ttg.min_reg_auto_ws";
49+
constexpr static char AttrMaxRegAutoWSName[] = "ttg.max_reg_auto_ws";
4850

4951
// Find the contextual number of warps on which this operation is executed.
5052
int lookupNumWarps(Operation *op);

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def TritonGPUScheduleLoops : Pass<"tritongpu-schedule-loops", "mlir::ModuleOp">
4747
The `tritongpu-schedule-loops` pass performs scheduling for loop pipelining
4848
for loops with latency ops.
4949
}];
50+
51+
let options = [
52+
Option<"numStages", "num-stages", "int32_t", /*default*/"3",
53+
"number of pipeline stages">
54+
];
5055
}
5156

5257
def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp"> {

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ void lowerLoops(ModuleOp moduleOp);
2020

2121
bool hasGpuBarriers(scf::ForOp forOp);
2222
bool isSafeToPipeline(scf::ForOp forOp);
23+
// Do any preprocessing on the loop information for a given module.
24+
void doLoopSchedulePreprocessing(ModuleOp moduleOp, Builder &builder);
25+
// TODO: Remove me and move to pass structure.
26+
void scheduleLoops(ModuleOp moduleOp, int defaultNumStages);
2327
llvm::MapVector<Operation *, std::pair<int, Operation *>>
2428
loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
2529
triton::ModuleAxisInfoAnalysis &axisInfoAnalysis,
@@ -155,7 +159,7 @@ class CoarseSchedule {
155159
auto begin() const { return opToStageAndCluster.begin(); }
156160

157161
// Set <stage, cluster> based on CoarseSchedule.
158-
void serialize(scf::ForOp &forOp) const;
162+
void serialize(scf::ForOp &forOp, bool keepExistingMaxStage = true) const;
159163
// Create a CoarseSchedule based on forOp's <stage, cluster>.
160164
LogicalResult deSerialize(scf::ForOp &forOp);
161165

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,11 @@ LogicalResult getConvertBackwardSlice(
183183
nullptr);
184184

185185
// Populate pattern to remove dead cycles in ForOp.
186-
void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);
186+
// opsCanBeTriviallyDead specifies the operations of which the side effect can
187+
// be ignored.
188+
void populateForOpDeadArgumentElimination(
189+
RewritePatternSet &patterns,
190+
const DenseSet<Operation *> &opsCanBeTriviallyDead = {});
187191

188192
// Convert an \param index to a multi-dim coordinate given \param shape and
189193
// \param order.
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#ifndef TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_UTILITY_H_
22
#define TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_UTILITY_H_
33

4+
#include "triton/Analysis/Allocation.h"
45
#include "triton/Dialect/Triton/IR/Dialect.h"
56
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
67

78
namespace mlir::triton::nvidia_gpu {
89

910
LogicalResult verifyBarrierType(Operation *op,
1011
mlir::triton::gpu::MemDescType barrierType);
12+
int allocateTMemWithInterval(
13+
DenseMap<Operation *, Interval<int>> &allocToIntervals,
14+
SmallVector<Operation *> &allocOrder);
1115

12-
}
16+
} // namespace mlir::triton::nvidia_gpu
1317

1418
#endif // TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_UTILITY_H_

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
3939
"TRITON_LLVM_DEBUG_ONLY",
4040
"TRITON_ENABLE_ASAN",
4141
"TRITON_OVERRIDE_ARCH",
42+
"TRITON_USE_OAI_WS",
4243
"USE_IR_LOC",
4344
"NVPTX_ENABLE_DUMP",
4445
"ALLOW_LHS_TMEM_LAYOUT_CONVERSION",

lib/Analysis/Allocation.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,16 @@ class AllocationAnalysis {
302302
continue;
303303
}
304304

305-
// Any scratch memory's live range is the current operation's live
306-
// range.
307-
bufferRange.insert(
308-
{buffer, Interval(operationId.at(op), operationId.at(op) + 1)});
305+
if (op && isa<mlir::triton::gpu::WarpSpecializeOp>(op)) {
306+
bufferRange.insert(
307+
{buffer, Interval((size_t)0, (size_t)operationId.size())});
308+
} else {
309+
310+
// Any scratch memory's live range is the current operation's live
311+
// range.
312+
bufferRange.insert(
313+
{buffer, Interval(operationId.at(op), operationId.at(op) + 1)});
314+
}
309315
LLVM_DEBUG({
310316
llvm::dbgs() << "-- buffer " << buffer->id << "; value: ";
311317
op->dump();
@@ -341,15 +347,23 @@ class AllocationAnalysis {
341347
// Analyze liveness of explicit buffers
342348
Liveness liveness(operation);
343349
auto getValueLivenessRange = [&](Value value) {
350+
Operation *defOp = value.getDefiningOp();
344351
auto liveOperations = liveness.resolveLiveness(value);
345352
auto minId = std::numeric_limits<size_t>::max();
346353
auto maxId = std::numeric_limits<size_t>::min();
347354
llvm::for_each(liveOperations, [&](Operation *liveOp) {
348-
if (operationId[liveOp] < minId) {
349-
minId = operationId[liveOp];
350-
}
351-
if ((operationId[liveOp] + 1) > maxId) {
352-
maxId = operationId[liveOp] + 1;
355+
if (liveOp && isa<mlir::triton::gpu::WarpSpecializeOp>(liveOp)) {
356+
minId = 0;
357+
if ((operationId[liveOp] + 1) > maxId) {
358+
maxId = operationId[liveOp] + 1;
359+
}
360+
} else {
361+
if (operationId[liveOp] < minId) {
362+
minId = operationId[liveOp];
363+
}
364+
if ((operationId[liveOp] + 1) > maxId) {
365+
maxId = operationId[liveOp] + 1;
366+
}
353367
}
354368
});
355369
return Interval(minId, maxId);

lib/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,91 @@
11
#include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
2+
#include "triton/Analysis/Allocation.h"
3+
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
4+
#include "triton/Tools/Sys/GetEnv.hpp"
5+
#include <cstdlib>
6+
#include <string>
27

38
namespace mlir::triton::gpu {
49

10+
// Helper function to compute allocation size from MemDescType
11+
inline size_t computeAllocationSize(MemDescType memdescTy) {
12+
auto elemTy = memdescTy.getElementType();
13+
auto shape = memdescTy.getShape();
14+
size_t elemSize = elemTy.getIntOrFloatBitWidth() / 8;
15+
size_t totalElements = 1;
16+
for (auto dim : shape) {
17+
totalElements *= dim;
18+
}
19+
return totalElements * elemSize;
20+
}
21+
22+
// Helper function to add allocation information as IR annotations
23+
void addAllocationAnnotations(Operation *op) {
24+
MLIRContext *ctx = op->getContext();
25+
IntegerAttr offsetAttr;
26+
MemDescType memdescTy;
27+
28+
// Try to get allocation.offset from the operation itself
29+
if (auto attr = op->getAttrOfType<IntegerAttr>("allocation.offset")) {
30+
offsetAttr = attr;
31+
// Find MemDescType from result or operands
32+
for (auto result : op->getResults()) {
33+
if (auto ty = dyn_cast<MemDescType>(result.getType())) {
34+
memdescTy = ty;
35+
break;
36+
}
37+
}
38+
if (!memdescTy) {
39+
for (auto operand : op->getOperands()) {
40+
if (auto ty = dyn_cast<MemDescType>(operand.getType())) {
41+
memdescTy = ty;
42+
break;
43+
}
44+
}
45+
}
46+
} else {
47+
// Try to find it through operands
48+
for (auto operand : op->getOperands()) {
49+
if (auto definingOp = operand.getDefiningOp()) {
50+
if (auto allocOp = dyn_cast<triton::gpu::LocalAllocOp>(definingOp)) {
51+
if (auto attr =
52+
allocOp->getAttrOfType<IntegerAttr>("allocation.offset")) {
53+
offsetAttr = attr;
54+
memdescTy = cast<MemDescType>(allocOp.getType());
55+
break;
56+
}
57+
}
58+
}
59+
}
60+
}
61+
62+
if (!offsetAttr || !memdescTy) {
63+
return;
64+
}
65+
66+
auto offset = offsetAttr.getInt();
67+
size_t totalSize = computeAllocationSize(memdescTy);
68+
op->setAttr("shared_memory.offset",
69+
IntegerAttr::get(IntegerType::get(ctx, 64), offset));
70+
op->setAttr("shared_memory.size_bytes",
71+
IntegerAttr::get(IntegerType::get(ctx, 64), totalSize));
72+
}
73+
74+
// Function to add shared memory access annotations to all operations that use
75+
// shared memory
76+
void addSharedMemoryAnnotations(ModuleOp mod) {
77+
if (!triton::tools::getBoolEnv("MLIR_ENABLE_DUMP")) {
78+
return;
79+
}
80+
81+
mod.walk([&](Operation *op) {
82+
if (isa<triton::gpu::LocalStoreOp, triton::gpu::LocalLoadOp,
83+
triton::gpu::MemDescSubsliceOp, triton::gpu::MemDescIndexOp>(op)) {
84+
addAllocationAnnotations(op);
85+
}
86+
});
87+
}
88+
589
void attachAllocationSizeAndOffsetAttr(ModuleOp mod,
690
ModuleAllocation &allocation) {
791
MLIRContext *ctx = mod.getContext();

0 commit comments

Comments
 (0)