
Commit 43d1349

manman-ren and htyu authored
Pick WarpSpec PRs and fixes to 3.4 release (triton-lang#7462)
Including:
- Update pipeline to get GEMM/FA working (triton-lang#7136)
- Use required layout for buffers (triton-lang#7284)
- Add back support of async_task

Verified GEMM + WarpSpec and tma_ws TritonBench FA

Co-authored-by: Hongtao Yu <[email protected]>
1 parent 3ba7d6d commit 43d1349

File tree: 20 files changed, +697 / -85 lines


lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 4 additions & 4 deletions
@@ -33,12 +33,13 @@ namespace gpu {
 #define GEN_PASS_DEF_TRITONGPUPIPELINE
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"

-static void pipelineWgmma(ModuleOp moduleOp) {
+static void pipelineWgmma(ModuleOp moduleOp, unsigned numStages) {
   SmallVector<scf::ForOp> loops;
   moduleOp->walk([&](scf::ForOp forOp) { loops.push_back(forOp); });

   for (scf::ForOp forOp : loops) {
-    mlir::triton::asyncLaunchDots(forOp);
+    if (getNumStagesOrDefault(forOp, numStages) >= 1)
+      mlir::triton::asyncLaunchDots(forOp);
   }
 }

@@ -223,7 +224,6 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {

   void runOnOperation() override {
     ModuleOp moduleOp = getOperation();
-
     // Transform the loop by introducing async operations to prepare it for
     // pipeline expansion.
     lowerLoops(moduleOp);

@@ -244,7 +244,7 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
     // Cleanup the IR from the pipeline attributes.
     removeAttributes(moduleOp);

-    pipelineWgmma(moduleOp);
+    pipelineWgmma(moduleOp, numStages);

     // schedule the waits
     mlir::triton::updateWaits(getOperation());

python/src/ir.cc

Lines changed: 20 additions & 0 deletions
@@ -38,6 +38,20 @@

 #include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"

+#include "llvm/ADT/SmallVector.h"
+
+void setAsyncTaskIds(mlir::Operation *op,
+                     llvm::ArrayRef<AsyncTaskId> asyncTaskIds) {
+  llvm::SmallVector<AsyncTaskId> sortedAsyncTaskIds(asyncTaskIds.begin(),
+                                                    asyncTaskIds.end());
+  sort(sortedAsyncTaskIds);
+  auto i32Ty = IntegerType::get(op->getContext(), 32);
+  auto size = static_cast<int64_t>(sortedAsyncTaskIds.size());
+  auto vecTy = VectorType::get(size, i32Ty);
+  op->setAttr("async_task_id",
+              DenseI32ArrayAttr::get(op->getContext(), sortedAsyncTaskIds));
+}
+
 namespace {

 namespace py = pybind11;

@@ -744,6 +758,12 @@ void init_triton_ir(py::module &&m) {
           [](TritonOpBuilder &self, OpBuilder::InsertPoint pt) {
             self.restoreInsertionPoint(pt);
           })
+      .def("set_async_task_ids",
+           [](TritonOpBuilder &self, std::vector<int> v) {
+             self.setAsyncTaskIds(v);
+           })
+      .def("unset_async_task_ids",
+           [](TritonOpBuilder &self) { self.unsetAsyncTaskIds(); })
       // Attr
       .def(
          "get_unit_attr",

python/src/ir.h

Lines changed: 16 additions & 1 deletion
@@ -1,8 +1,13 @@
 #pragma once
 #include "mlir/IR/Builders.h"
 #include "triton/Tools/Sys/GetEnv.hpp"
+#include "llvm/ADT/ArrayRef.h"
 #include <memory>

+typedef int AsyncTaskId;
+void setAsyncTaskIds(mlir::Operation *op,
+                     llvm::ArrayRef<AsyncTaskId> asyncTaskIds);
+
 // A custom op builder that keeps track of the last location
 class TritonOpBuilder {
 public:

@@ -62,7 +67,10 @@ class TritonOpBuilder {

   template <typename OpTy, typename... Args> OpTy create(Args &&...args) {
     auto loc = getLastLoc();
-    return builder->create<OpTy>(loc, std::forward<Args>(args)...);
+    auto ret = builder->create<OpTy>(loc, std::forward<Args>(args)...);
+    if (asyncTaskIds)
+      ::setAsyncTaskIds(ret, *asyncTaskIds);
+    return ret;
   }

   // Overload to create or fold a single result operation.

@@ -82,9 +90,16 @@
     return builder->createOrFold<OpTy>(loc, std::forward<Args>(args)...);
   }

+  void setAsyncTaskIds(std::vector<int> taskIds) {
+    this->asyncTaskIds = taskIds;
+  }
+
+  void unsetAsyncTaskIds() { this->asyncTaskIds = std::nullopt; }
+
 private:
   std::unique_ptr<mlir::OpBuilder> builder;
   std::unique_ptr<mlir::Location> lastLoc;
+  std::optional<std::vector<int>> asyncTaskIds;
   bool lineInfoEnabled =
       !mlir::triton::tools::getBoolEnv("TRITON_DISABLE_LINE_INFO");
 };

python/triton/compiler/code_generator.py

Lines changed: 14 additions & 0 deletions
@@ -926,6 +926,20 @@ def _verify_loop_carried_variable(self, name, loop_val, live_val):
             f'but is re-assigned to {loop_val.type} in loop! '\
             f'Please make sure that the type stays consistent.'

+    def visit_withitem(self, node):
+        return self.visit(node.context_expr)
+
+    def visit_With(self, node):
+        assert len(node.items) == 1
+        context = node.items[0].context_expr
+        withitemClass = self.visit(context.func)
+        if withitemClass == language.async_task:
+            args = [self.visit(arg) for arg in context.args]
+            with withitemClass(*args, _builder=self.builder):
+                self.visit_compound_statement(node.body)
+        else:
+            self.visit_compound_statement(node.body)
+
     def visit_While(self, node):
         with enter_sub_region(self) as sr:
             liveins, insert_block = sr

python/triton/language/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@
     arange,
     associative_scan,
     assume,
+    async_task,
     atomic_add,
     atomic_and,
     atomic_cas,

@@ -145,6 +146,7 @@
     "argmin",
     "associative_scan",
     "assume",
+    "async_task",
     "atomic_add",
     "atomic_and",
     "atomic_cas",

python/triton/language/core.py

Lines changed: 16 additions & 0 deletions
@@ -3138,6 +3138,22 @@ def __next__(self):
         raise RuntimeError("static_range can only be used in @triton.jit'd functions")


+class async_task:
+    """
+    Context manager to run code fragments asynchronously.
+    """
+
+    def __init__(self, task_ids, _builder=None):
+        self.task_ids = list({_unwrap_if_constexpr(tid) for tid in task_ids})
+        self.builder = _builder
+
+    def __enter__(self):
+        self.builder.set_async_task_ids(self.task_ids)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.builder.unset_async_task_ids()
+
+
 class range:
     """
     Iterator that counts upward forever.
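An illustrative kernel (hypothetical, not part of this commit) showing how the restored tl.async_task context manager is meant to be used inside @triton.jit code: ops emitted under each `with` block are tagged with the listed task ids, matching the async_task_id attributes in the ws_code_partition.mlir test below. It only has an effect with a backend and pass pipeline that support warp specialization (e.g. Hopper).

import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    with tl.async_task([0]):
        # producer task: issue the loads
        x = tl.load(x_ptr + offs, mask=mask)
        y = tl.load(y_ptr + offs, mask=mask)
    with tl.async_task([1]):
        # consumer task: compute and store
        tl.store(out_ptr + offs, x + y, mask=mask)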

python/tutorials/09-persistent-matmul.py

Lines changed: 44 additions & 20 deletions
@@ -47,8 +47,12 @@ def supports_tma():
     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9


+def is_hopper():
+    return torch.cuda.get_device_capability()[0] == 9
+
+
 def supports_ws():
-    return is_cuda() and torch.cuda.get_device_capability()[0] >= 10
+    return is_cuda() and torch.cuda.get_device_capability()[0] >= 9


 def _matmul_launch_metadata(grid, kernel, args):

@@ -465,21 +469,31 @@ def grid(META):
     return c


-@triton.autotune(
-    configs=matmul_tma_persistent_get_configs(),
-    key=["M", "N", "K", "WARP_SPECIALIZE"],
-)
+def prune_invalid_configs(configs, named_args, **kwargs):
+    FLATTEN = kwargs["FLATTEN"]
+    # Filter out configs where EPILOGUE_SUBTILE is true and HOPPER is true
+    return [conf for conf in configs if not (conf.kwargs.get("EPILOGUE_SUBTILE", True) and FLATTEN is False)]
+
+
+@triton.autotune(configs=matmul_tma_persistent_get_configs(), key=["M", "N", "K", "WARP_SPECIALIZE", "FLATTEN"],
+                 prune_configs_by={'early_config_prune': prune_invalid_configs})
 @triton.jit(launch_metadata=_matmul_launch_metadata)
-def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr,  #
-                                        M, N, K,  #
-                                        BLOCK_SIZE_M: tl.constexpr,  #
-                                        BLOCK_SIZE_N: tl.constexpr,  #
-                                        BLOCK_SIZE_K: tl.constexpr,  #
-                                        GROUP_SIZE_M: tl.constexpr,  #
-                                        EPILOGUE_SUBTILE: tl.constexpr,  #
-                                        NUM_SMS: tl.constexpr,  #
-                                        WARP_SPECIALIZE: tl.constexpr,  #
-                                        ):
+def matmul_kernel_descriptor_persistent(
+        a_ptr,
+        b_ptr,
+        c_ptr,  #
+        M,
+        N,
+        K,  #
+        BLOCK_SIZE_M: tl.constexpr,  #
+        BLOCK_SIZE_N: tl.constexpr,  #
+        BLOCK_SIZE_K: tl.constexpr,  #
+        GROUP_SIZE_M: tl.constexpr,  #
+        EPILOGUE_SUBTILE: tl.constexpr,  #
+        NUM_SMS: tl.constexpr,  #
+        WARP_SPECIALIZE: tl.constexpr,  #
+        FLATTEN: tl.constexpr,
+):
     # Matmul using TMA and device-side descriptor creation
     dtype = c_ptr.dtype.element_ty
     start_pid = tl.program_id(axis=0)

@@ -512,7 +526,7 @@ def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr,  #
     tile_id_c = start_pid - NUM_SMS
     num_pid_in_group = GROUP_SIZE_M * num_pid_n

-    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE):
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=FLATTEN, warp_specialize=WARP_SPECIALIZE):
         pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
         offs_am = pid_m * BLOCK_SIZE_M
         offs_bn = pid_n * BLOCK_SIZE_N

@@ -560,12 +574,19 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):

     triton.set_allocator(alloc_fn)

+    # Hopper warpspec doesn't work with flatten
+    flatten = False if (warp_specialize and is_hopper()) else True
     grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"])), )
     matmul_kernel_descriptor_persistent[grid](
-        a, b, c,  #
-        M, N, K,  #
+        a,
+        b,
+        c,  #
+        M,
+        N,
+        K,  #
         NUM_SMS=NUM_SMS,  #
         WARP_SPECIALIZE=warp_specialize,  #
+        FLATTEN=flatten,
     )
     return c

@@ -632,7 +653,8 @@ def bench(K, dtype, reps=10000, warmup_reps=10000):
     warp_specialize = [False, True] if HAS_WARP_SPECIALIZE else [False]
     for ws in warp_specialize:
         ws_str = "_ws" if ws else ""
-        if HAS_HOST_TENSOR_DESC:
+        # disable on-host warpspec on Hopper
+        if HAS_HOST_TENSOR_DESC and not (is_hopper() and ws):
            bench_fn(f"tma_persistent{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma_persistent(a, b, ws), a, b)
            bench_fn(f"tma{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma(a, b, ws), a, b)
     if HAS_TENSOR_DESC:

@@ -671,7 +693,9 @@ def validate(M, N, K, dtype):

     for (kernel, label, enabled), warp_specialize in itertools.product(kernels, warp_specialize):
         label = f"{label} (warp_specialize={warp_specialize})"
-        enabled = enabled and (not warp_specialize or HAS_TENSOR_DESC)
+        # skip if hopper and warp_specialize and not on-device
+        skipped = is_hopper() and warp_specialize and kernel != matmul_descriptor_persistent
+        enabled = enabled and (not warp_specialize or HAS_TENSOR_DESC) and (not skipped)
         run_test(naive_result, lambda a, b: kernel(a, b, warp_specialize), a, b, label, enabled)
     print()
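A condensed restatement (illustrative only, assuming the tutorial's own helpers such as is_hopper and HAS_HOST_TENSOR_DESC) of the Hopper-specific gating this tutorial change spreads across the autotune pruning, the host wrapper, and the benchmark/validation loops:

def pick_flatten(warp_specialize: bool) -> bool:
    # device-side descriptor kernel: Hopper warpspec does not support a flattened loop
    return not (warp_specialize and is_hopper())

def host_tma_bench_enabled(ws: bool) -> bool:
    # bench(): skip host-side TMA benchmarks when warp specializing on Hopper
    return HAS_HOST_TENSOR_DESC and not (is_hopper() and ws)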

test/Hopper/WarpSpecialization/ws_code_partition.mlir

Lines changed: 45 additions & 0 deletions
@@ -260,3 +260,48 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+
+// -----
+
+// CHECK-DAG: #[[$SHARED:.*]] = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+// CHECK-DAG: #[[$SHARED1:.*]] = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
+// CHECK-LABEL: @_fbgemm_grouped_gemm_fp8_rowwise_ws
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x64x64xf8E4M3FN, #[[$SHARED1]], #smem, mutable>
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x128x64xf8E4M3FN, #[[$SHARED1]], #smem, mutable>
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x128xf32, #[[$SHARED]], #smem, mutable>
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 32]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
+#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+#shared2 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true, elementBitWidth = 8}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @_fbgemm_grouped_gemm_fp8_rowwise_ws(%arg0: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}, %arg1: i32, %arg2: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}, %arg3: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}) attributes {noinline = false} {
+    %c0_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 0 : i32
+    %c2048_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 2048 : i32
+    %c64_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 64 : i32
+    %cst = arith.constant {async_task_id = array<i32: 0, 1, 2>} dense<0.000000e+00> : tensor<64x128xf32, #mma>
+    %0 = tt.get_program_id x {async_task_id = array<i32: 0, 1, 2>} : i32
+    %1 = ttng.reinterpret_tensor_descriptor %arg0 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<64x64xf8E4M3FN, #shared>>
+    %2 = ttng.reinterpret_tensor_descriptor %arg2 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared>>
+    %3 = ttng.reinterpret_tensor_descriptor %arg3 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<128xf32, #shared1>>
+    scf.for %arg4 = %0 to %arg1 step %c64_i32 : i32 {
+      %4 = arith.muli %arg4, %c2048_i32 {async_task_id = array<i32: 0>} : i32
+      %5 = scf.for %arg5 = %c0_i32 to %c2048_i32 step %c64_i32 iter_args(%arg6 = %cst) -> (tensor<64x128xf32, #mma>) : i32 {
+        %8 = tt.descriptor_load %1[%4, %arg5] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<64x64xf8E4M3FN, #shared>> -> tensor<64x64xf8E4M3FN, #blocked>
+        %9 = ttg.local_alloc %8 {async_task_id = array<i32: 1>} : (tensor<64x64xf8E4M3FN, #blocked>) -> !ttg.memdesc<64x64xf8E4M3FN, #shared, #smem>
+        %10 = tt.descriptor_load %2[%4, %arg5] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared>> -> tensor<128x64xf8E4M3FN, #blocked>
+        %11 = ttg.local_alloc %10 {async_task_id = array<i32: 1, 2>} : (tensor<128x64xf8E4M3FN, #blocked>) -> !ttg.memdesc<128x64xf8E4M3FN, #shared, #smem>
+        %12 = ttg.memdesc_trans %11 {async_task_id = array<i32: 1, 2>, order = array<i32: 1, 0>} : !ttg.memdesc<128x64xf8E4M3FN, #shared, #smem> -> !ttg.memdesc<64x128xf8E4M3FN, #shared2, #smem>
+        %13 = ttng.warp_group_dot %9, %12, %arg6 {async_task_id = array<i32: 1>, inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !ttg.memdesc<64x64xf8E4M3FN, #shared, #smem> * !ttg.memdesc<64x128xf8E4M3FN, #shared2, #smem> -> tensor<64x128xf32, #mma>
+        scf.yield {async_task_id = array<i32: 1, 2>} %13 : tensor<64x128xf32, #mma>
+      } {async_task_id = array<i32: 0, 1, 2>}
+      %6 = tt.descriptor_load %3[%4] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<128xf32, #shared1>> -> tensor<128xf32, #blocked1>
+      %7 = ttg.convert_layout %6 {async_task_id = array<i32: 1, 2>} : tensor<128xf32, #blocked1> -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    } {async_task_id = array<i32: 1, 2>}
+    tt.return
+  }
+}

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -259,6 +259,7 @@ def make_ttgir(mod, metadata, opt, capability):
        passes.ttir.add_triton_licm(pm)
        passes.common.add_canonicalizer(pm)
        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+       nvidia.passes.hopper.add_hopper_warpspec(pm, opt.num_stages, dump_enabled)
        passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
        passes.ttgpuir.add_schedule_loops(pm)
        passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)

third_party/nvidia/hopper/include/Transforms/Passes.td

Lines changed: 5 additions & 2 deletions
@@ -14,9 +14,12 @@ def NVGPUWarpSpecialization : Pass<"nvgpu-warp-specialization", "mlir::ModuleOp"

   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
   let options = [
-    Option<"numWarpGroups", "num-warp-groups",
+    Option<"numStages", "num-stages",
            "int32_t", /*default*/"0",
-           "number of warp groups for warp specialization">
+           "number of buffers for warp specialization">,
+    Option<"dumpIntermediateSteps", "dump-intermediate-steps",
+           "bool", /*default*/"false",
+           "Dump intermediate steps">
   ];
 }
