intel
diff --git a/Makefile b/Makefile
Lines changed: 2 additions & 2 deletions
Lines changed: 2 additions & 2 deletions
diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h
Lines changed: 3 additions & 2 deletions
Lines changed: 3 additions & 2 deletions
diff --git a/include/triton/Analysis/Allocation.h b/include/triton/Analysis/Allocation.h
Lines changed: 13 additions & 12 deletions
Lines changed: 13 additions & 12 deletions
diff --git a/include/triton/Analysis/Membar.h b/include/triton/Analysis/Membar.h
Lines changed: 4 additions & 1 deletion
Lines changed: 4 additions & 1 deletion
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.h b/include/triton/Conversion/TritonGPUToLLVM/Passes.h
Lines changed: 2 additions & 11 deletions
Lines changed: 2 additions & 11 deletions
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.td b/include/triton/Conversion/TritonGPUToLLVM/Passes.td
Lines changed: 24 additions & 11 deletions
Lines changed: 24 additions & 11 deletions
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 6 additions & 3 deletions
Lines changed: 6 additions & 3 deletions
diff --git a/include/triton/Dialect/Triton/IR/Traits.h b/include/triton/Dialect/Triton/IR/Traits.h
Lines changed: 5 additions & 0 deletions
Lines changed: 5 additions & 0 deletions
diff --git a/include/triton/Dialect/Triton/IR/TritonInterfaces.td b/include/triton/Dialect/Triton/IR/TritonInterfaces.td
Lines changed: 1 addition & 0 deletions
Lines changed: 1 addition & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
Lines changed: 10 additions & 7 deletions
Lines changed: 10 additions & 7 deletions
@@ -89,9 +89,9 @@ dev-install: dev-install-requires dev-install-triton
 
 .PHONY: golden-samples
 golden-samples: triton-opt
-	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-loop-scheduling -tritongpu-pipeline -canonicalize | \
+	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-pipeline -canonicalize | \
 		$(PYTHON) utils/generate-test-checks.py --source test/TritonGPU/samples/simulated-grouped-gemm.mlir.in --source_delim_regex="\bmodule" \
 		-o test/TritonGPU/samples/simulated-grouped-gemm.mlir
-	$(TRITON_OPT) test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in -tritongpu-loop-scheduling -tritongpu-pipeline -canonicalize | \
+	$(TRITON_OPT) test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in -tritongpu-pipeline -canonicalize | \
 		$(PYTHON) utils/generate-test-checks.py --source test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in --source_delim_regex="\bmodule" \
 		-o test/TritonGPU/samples/descriptor-matmul-pipeline.mlir
@@ -68,8 +68,9 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::intel::registerConvertTritonToTritonGPUWarpPass();
   mlir::triton::intel::registerTritonIntelRemoveMasks();
   mlir::triton::intel::registerTritonRaiseBlockPointer();
-  mlir::triton::registerAllocateSharedMemoryPass();
-  mlir::triton::registerTritonGPUGlobalScratchAllocationPass();
+  mlir::triton::gpu::registerAllocateSharedMemoryPass();
+  mlir::triton::gpu::registerTritonGPUAllocateWarpGroups();
+  mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
   mlir::triton::registerConvertTritonGPUToLLVMPass();
   mlir::triton::registerConvertNVGPUToLLVMPass();
   mlir::registerLLVMDIScope();
 
@@ -191,22 +191,19 @@ class Allocation {
     /// Virtual: triton.call
     enum class BufferKind { Explicit, Scratch, Virtual };
 
-    /// MT: thread-safe
-    inline static std::atomic<BufferId> nextId = 0;
-
     BufferKind kind;
     BufferId id;
+    Operation *owner;
     size_t size;
     size_t alignment;
     size_t offset;
 
     bool operator==(const BufferT &other) const { return id == other.id; }
     bool operator<(const BufferT &other) const { return id < other.id; }
 
-    BufferT() : BufferT(BufferKind::Explicit, 0) {}
-    BufferT(BufferKind kind, size_t size, size_t alignment = 4,
-            size_t offset = 0)
-        : kind(kind), id(nextId++), size(size), alignment(alignment),
+    BufferT(BufferKind kind, BufferId id, Operation *owner, size_t size,
+            size_t alignment = 4, size_t offset = 0)
+        : kind(kind), id(id), owner(owner), size(size), alignment(alignment),
           offset(offset) {}
 
     size_t setOffsetAligned(size_t newOffset) {
@@ -226,14 +223,16 @@ class Allocation {
 private:
   template <BufferT::BufferKind Kind, typename KeyType, typename... Args>
   void addBuffer(KeyType &key, Args &&...args) {
-    auto buffer = BufferT(Kind, std::forward<Args>(args)...);
-    bufferSet[buffer.id] = std::move(buffer);
+    BufferId nextId = bufferIdCounter++;
+    auto [it, inserted] = bufferSet.insert_or_assign(
+        nextId, BufferT(Kind, nextId, key, std::forward<Args>(args)...));
+    BufferT *buffer = &it->second;
     if constexpr (Kind == BufferT::BufferKind::Explicit) {
-      valueBuffer[key] = &bufferSet[buffer.id];
+      valueBuffer[key] = buffer;
     } else if constexpr (Kind == BufferT::BufferKind::Virtual) {
-      opVirtual[key] = &bufferSet[buffer.id];
+      opVirtual[key] = buffer;
     } else {
-      opScratch[key] = &bufferSet[buffer.id];
+      opScratch[key] = buffer;
     }
   }
 
@@ -250,6 +249,8 @@ class Allocation {
   BufferSetT bufferSet;
   size_t sharedMemorySize = 0;
 
+  size_t bufferIdCounter = 0;
+
   friend class triton::AllocationAnalysis;
 };
 
 
@@ -97,6 +97,8 @@ struct BlockInfo {
 // Shared Memory Barrier Analysis
 //===----------------------------------------------------------------------===//
 class MembarAnalysis {
+  using VirtualBlock = std::pair<Block *, Block::iterator>;
+
 public:
   using FuncBlockInfoMapT = CallGraph<BlockInfo>::FuncDataMapT;
   /// Creates a new Membar analysis that generates the shared memory barrier
@@ -143,7 +145,8 @@ class MembarAnalysis {
               FuncBlockInfoMapT *funcBlockInfoMap, OpBuilder *builder);
 
   /// Collects the successors of the terminator
-  void visitTerminator(Operation *operation, SmallVector<Block *> &successors);
+  void visitTerminator(Operation *operation,
+                       SmallVector<VirtualBlock> &successors);
 
   void insertBarrier(Operation *operation, OpBuilder *builder);
 
 
@@ -1,9 +1,7 @@
 #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
 #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
 
-#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/DialectConversion.h"
 
 #include <memory>
 
@@ -12,22 +10,15 @@ namespace mlir {
 class ModuleOp;
 template <typename T> class OperationPass;
 
-namespace triton {
+namespace triton::gpu {
 
 #define GEN_PASS_DECL
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
 
-namespace gpu {
-std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass();
-
-std::unique_ptr<Pass> createTritonGPUGlobalScratchAllocationPass();
-
-} // namespace gpu
-
 #define GEN_PASS_REGISTRATION
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
 
-} // namespace triton
+} // namespace triton::gpu
 
 } // namespace mlir
 
 
@@ -4,15 +4,14 @@
 include "mlir/Pass/PassBase.td"
 
 def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> {
-    let summary = "Add metadata for shared memory allocation";
-    let description = [{
-      This pass uses the `ModuleAllocation` analysis to:
-        - Annotate modules with an attribute with the amount of shared/local
-          memory used.
-        - Annotate operations with an offset into the total shared/local memory.
-     }];
-
-    let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()";
+  let summary = "Add metadata for shared memory allocation";
+
+  let description = [{
+    This pass uses the `ModuleAllocation` analysis to:
+      - Annotate modules with an attribute with the amount of shared/local
+        memory used.
+      - Annotate operations with an offset into the total shared/local memory.
+  }];
 }
 
 def TritonGPUGlobalScratchAllocationPass : Pass<"tritongpu-global-scratch-memory-allocation", "mlir::ModuleOp"> {
@@ -22,11 +21,25 @@ def TritonGPUGlobalScratchAllocationPass : Pass<"tritongpu-global-scratch-memory
     Decide on global scratch space memory allocation and assign attributes to each allocation.
   }];
 
-  let constructor = "mlir::triton::gpu::createTritonGPUGlobalScratchAllocationPass()";
-
   let dependentDialects = [
     "mlir::triton::gpu::TritonGPUDialect"
   ];
 }
 
+def TritonGPUAllocateWarpGroups : Pass<"tritongpu-allocate-warp-groups", "mlir::ModuleOp"> {
+  let summary = "Allocate warp groups";
+
+  let description = [{
+    The `tritongpu-allocate-warp-groups` pass performs warpgroup allocation for
+    a GPU program. When a GPU program contains warp specialization, additional
+    warps are launched in addition to the "default" warp group. The "default"
+    warpgroup executes top-level code in a `tt.func` and its size is specified
+    by the user via the `num_warps` argument.
+
+    This pass analyzes `ttg.warp_specialize` ops in the program and determines
+    the total number of needed warps, then attaches the range of warp IDs to
+    each warpgroup function.
+  }];
+}
+
 #endif
@@ -575,15 +575,18 @@ Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v, Value scale,
 // Hardware Indices
 // -----------------------------------------------------------------------
 
+// If `block` is contained within a warp specialize region, this returns the
+// thread ID offset of that warpgroup (presumably empty otherwise; confirm).
+std::optional<int> getWarpGroupStartThreadId(Block *block);
+
 // Returns CTA level thread ID.
 Value getThreadId(OpBuilder &rewriter, Location loc);
 
 // Get the lane ID, which is index of the thread within its warp.
-Value getLaneId(OpBuilder &rewriter, Location loc, unsigned threadsPerWarp);
+Value getLaneId(OpBuilder &rewriter, Location loc);
 
 // Get the lane ID and warp ID.
-std::pair<Value, Value> getLaneAndWarpId(OpBuilder &rewriter, Location loc,
-                                         unsigned threadsPerWarp);
+std::pair<Value, Value> getLaneAndWarpId(OpBuilder &rewriter, Location loc);
 
 // -----------------------------------------------------------------------
 // Shared memory utilities
 
@@ -114,6 +114,11 @@ class SameLoadStoreOperandsAndResultEncoding
   }
 };
 
+// This trait indicates that regions in the op may execute concurrently with
+// each other.
+template <typename ConcreteType>
+struct AsyncRegions : public TraitBase<ConcreteType, AsyncRegions> {};
+
 } // namespace OpTrait
 } // namespace mlir
 
 
@@ -12,6 +12,7 @@ def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">;
 def SameLoadStoreOperandsAndResultShape : NativeOpTrait<"SameLoadStoreOperandsAndResultShape">;
 def SameLoadStoreOperandsEncoding : NativeOpTrait<"SameLoadStoreOperandsEncoding">;
 def SameLoadStoreOperandsAndResultEncoding : NativeOpTrait<"SameLoadStoreOperandsAndResultEncoding">;
+def AsyncRegions : NativeOpTrait<"AsyncRegions">;
 
 // A trait equivalent to InferTypeOpAdaptor, but that checks for structural
 // equivalence of the layouts of the result rather than just layout equality.
 
@@ -333,7 +333,7 @@ def TTG_GlobalScratchAllocOp : TTG_Op<"global_scratch_alloc", [MemoryEffects<[Me
 }
 
 def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
-  RecursiveMemoryEffects, RecursivelySpeculatable,
+  RecursiveMemoryEffects, RecursivelySpeculatable, AsyncRegions,
   DeclareOpInterfaceMethods<RegionBranchOpInterface>
 ]> {
   let summary = "asynchronously execute code on multiple warpgroups";
@@ -362,21 +362,24 @@ def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
     }
     partition0(%arg0: i32, %arg1: i32) num_warps(8) {
       some_async_dispatch(%arg0, %arg1)
+      ttg.warp_return
     }
     partition1(%arg0: i32, %arg1: i32) num_warps(1) {
       some_async_dispatch(%arg0, %arg1)
+      ttg.warp_return
     } : (i32, i32) -> i32
     ```
   }];
 
   let arguments = (ins
     Variadic<AnyType>:$explicitCaptures,
-    DenseI32ArrayAttr:$partitionNumWarps
+    DenseI32ArrayAttr:$partitionNumWarps,
+    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds
   );
   let results = (outs Variadic<AnyType>:$defaultPassthrough);
 
   let regions = (region
-    SizedRegion<1>:$defaultRegion,
+    MinSizedRegion<1>:$defaultRegion,
     SizedRegion<1>:$partitionOpHolder
   );
 
@@ -390,20 +393,19 @@ def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
 
 def TTG_WarpSpecializePartitionsOp : TTG_Op<"warp_specialize.partitions", [
   IsolatedFromAbove, RecursiveMemoryEffects, RecursivelySpeculatable,
-  Terminator, HasParent<"WarpSpecializeOp">,
-  SingleBlockImplicitTerminator<"WarpReturnOp">
+  Terminator, HasParent<"WarpSpecializeOp">
 ]> {
   let summary = "container op for `ttg.warp_specialize`";
   let description = [{
     Because MLIR requires entire operations be isolated from above, this op
     contains the actual isolated from above regions of `ttg.warp_specialize`.
   }];
 
-  let regions = (region VariadicRegion<SizedRegion<1>>:$partitionRegions);
+  let regions = (region VariadicRegion<MinSizedRegion<1>>:$partitionRegions);
 }
 
 def TTG_WarpYieldOp : TTG_Op<"warp_yield", [
-  Pure, Terminator, HasParent<"WarpSpecializeOp">,
+  Pure, Terminator, ReturnLike, HasParent<"WarpSpecializeOp">,
   DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface>
 ]> {
   let summary = "yield from the default region of `ttg.warp_specialize`";
@@ -422,6 +424,7 @@ def TTG_WarpYieldOp : TTG_Op<"warp_yield", [
   let arguments = (ins Variadic<AnyType>:$values);
 
   let assemblyFormat = "($values^)? attr-dict (`:` type($values)^)?";
+  let hasVerifier = 1;
 }
 
 def TTG_WarpReturnOp : TTG_Op<"warp_return", [