[Hopper][WS] Update pipeline to get GEMM/FA working (#7136)

manman-ren · web-flow · commit 1f126370ff3e · 2025-06-23T11:50:58.000-07:00
Builds and runs for GEMM with
matmul_kernel_persistent_tma_ws_cooperative (i.e., two consumer groups doing
computation + epilogue, and one producer group doing loads).
Performance will be tuned in a follow-up diff.

We set num_stages to 0 after WarpSpec to disable software pipelining (SWP).
SWP is also updated to bail out if there is no loop with num_stages >= 1.
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
@@ -33,12 +33,13 @@ namespace gpu {
 #define GEN_PASS_DEF_TRITONGPUPIPELINE
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
-static void pipelineWgmma(ModuleOp moduleOp) {
+static void pipelineWgmma(ModuleOp moduleOp, unsigned numStages) {
   SmallVector<scf::ForOp> loops;
   moduleOp->walk([&](scf::ForOp forOp) { loops.push_back(forOp); });
 
   for (scf::ForOp forOp : loops) {
-    mlir::triton::asyncLaunchDots(forOp);
+    if (getNumStagesOrDefault(forOp, numStages) >= 1)
+      mlir::triton::asyncLaunchDots(forOp);
   }
 }
 
@@ -223,7 +224,6 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
 
   void runOnOperation() override {
     ModuleOp moduleOp = getOperation();
-
     // Transform the loop by introducing async operations to prepare it for
     // pipeline expansion.
     lowerLoops(moduleOp);
@@ -244,7 +244,7 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
     // Cleanup the IR from the pipeline attributes.
     removeAttributes(moduleOp);
 
-    pipelineWgmma(moduleOp);
+    pipelineWgmma(moduleOp, numStages);
 
     // schedule the waits
     mlir::triton::updateWaits(getOperation());
diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py
@@ -47,8 +47,12 @@ def supports_tma():
     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
 
 
+def is_hopper():
+    return torch.cuda.get_device_capability()[0] == 9
+
+
 def supports_ws():
-    return is_cuda() and torch.cuda.get_device_capability()[0] >= 10
+    return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
 
 
 def _matmul_launch_metadata(grid, kernel, args):
@@ -465,21 +469,31 @@ def grid(META):
     return c
 
 
-@triton.autotune(
-    configs=matmul_tma_persistent_get_configs(),
-    key=["M", "N", "K", "WARP_SPECIALIZE"],
-)
+def prune_invalid_configs(configs, named_args, **kwargs):
+    FLATTEN = kwargs["FLATTEN"]
+    # Filter out configs where EPILOGUE_SUBTILE is true (or unset) and FLATTEN is False
+    return [conf for conf in configs if not (conf.kwargs.get("EPILOGUE_SUBTILE", True) and FLATTEN is False)]
+
+
+@triton.autotune(configs=matmul_tma_persistent_get_configs(), key=["M", "N", "K", "WARP_SPECIALIZE", "FLATTEN"],
+                 prune_configs_by={'early_config_prune': prune_invalid_configs})
 @triton.jit(launch_metadata=_matmul_launch_metadata)
-def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr,  #
-                                        M, N, K,  #
-                                        BLOCK_SIZE_M: tl.constexpr,  #
-                                        BLOCK_SIZE_N: tl.constexpr,  #
-                                        BLOCK_SIZE_K: tl.constexpr,  #
-                                        GROUP_SIZE_M: tl.constexpr,  #
-                                        EPILOGUE_SUBTILE: tl.constexpr,  #
-                                        NUM_SMS: tl.constexpr,  #
-                                        WARP_SPECIALIZE: tl.constexpr,  #
-                                        ):
+def matmul_kernel_descriptor_persistent(
+    a_ptr,
+    b_ptr,
+    c_ptr,  #
+    M,
+    N,
+    K,  #
+    BLOCK_SIZE_M: tl.constexpr,  #
+    BLOCK_SIZE_N: tl.constexpr,  #
+    BLOCK_SIZE_K: tl.constexpr,  #
+    GROUP_SIZE_M: tl.constexpr,  #
+    EPILOGUE_SUBTILE: tl.constexpr,  #
+    NUM_SMS: tl.constexpr,  #
+    WARP_SPECIALIZE: tl.constexpr,  #
+    FLATTEN: tl.constexpr,
+):
     # Matmul using TMA and device-side descriptor creation
     dtype = c_ptr.dtype.element_ty
     start_pid = tl.program_id(axis=0)
@@ -512,7 +526,7 @@ def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr,  #
     tile_id_c = start_pid - NUM_SMS
     num_pid_in_group = GROUP_SIZE_M * num_pid_n
 
-    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE):
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=FLATTEN, warp_specialize=WARP_SPECIALIZE):
         pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
         offs_am = pid_m * BLOCK_SIZE_M
         offs_bn = pid_n * BLOCK_SIZE_N
@@ -560,12 +574,19 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
 
     triton.set_allocator(alloc_fn)
 
+    # Hopper warpspec doesn't work with flatten
+    flatten = False if (warp_specialize and is_hopper()) else True
     grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"])), )
     matmul_kernel_descriptor_persistent[grid](
-        a, b, c,  #
-        M, N, K,  #
+        a,
+        b,
+        c,  #
+        M,
+        N,
+        K,  #
         NUM_SMS=NUM_SMS,  #
         WARP_SPECIALIZE=warp_specialize,  #
+        FLATTEN=flatten,
     )
     return c
 
@@ -632,7 +653,8 @@ def bench(K, dtype, reps=10000, warmup_reps=10000):
     warp_specialize = [False, True] if HAS_WARP_SPECIALIZE else [False]
     for ws in warp_specialize:
         ws_str = "_ws" if ws else ""
-        if HAS_HOST_TENSOR_DESC:
+        # disable on-host warpspec on Hopper
+        if HAS_HOST_TENSOR_DESC and not (is_hopper() and ws):
             bench_fn(f"tma_persistent{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma_persistent(a, b, ws), a, b)
             bench_fn(f"tma{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma(a, b, ws), a, b)
         if HAS_TENSOR_DESC:
@@ -671,7 +693,9 @@ def validate(M, N, K, dtype):
 
     for (kernel, label, enabled), warp_specialize in itertools.product(kernels, warp_specialize):
         label = f"{label} (warp_specialize={warp_specialize})"
-        enabled = enabled and (not warp_specialize or HAS_TENSOR_DESC)
+        # skip if hopper and warp_specialize and not on-device
+        skipped = is_hopper() and warp_specialize and kernel != matmul_descriptor_persistent
+        enabled = enabled and (not warp_specialize or HAS_TENSOR_DESC) and (not skipped)
         run_test(naive_result, lambda a, b: kernel(a, b, warp_specialize), a, b, label, enabled)
     print()
 
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -260,6 +260,7 @@ def make_ttgir(mod, metadata, opt, capability):
             passes.ttir.add_triton_licm(pm)
             passes.common.add_canonicalizer(pm)
             passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+            nvidia.passes.hopper.add_hopper_warpspec(pm, opt.num_stages, dump_enabled)
             passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
             passes.ttgpuir.add_schedule_loops(pm)
             passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
diff --git a/third_party/nvidia/hopper/include/Transforms/Passes.td b/third_party/nvidia/hopper/include/Transforms/Passes.td
@@ -14,9 +14,12 @@ def NVGPUWarpSpecialization : Pass<"nvgpu-warp-specialization", "mlir::ModuleOp"
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
   let options = [
-    Option<"numWarpGroups", "num-warp-groups",
+    Option<"numStages", "num-stages",
            "int32_t", /*default*/"0",
-           "number of warp groups for warp specialization">
+           "number of buffers for warp specialization">,
+    Option<"dumpIntermediateSteps", "dump-intermediate-steps",
+           "bool", /*default*/"false",
+           "Dump intermediate steps">
   ];
 }
 
diff --git a/third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt b/third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt
@@ -7,6 +7,7 @@ add_triton_library(NVHopperTransforms
   WarpSpecialization/WSCodePartition.cpp
   WarpSpecialization/WSDataPartition.cpp
   WarpSpecialization/WSLowerMem.cpp
+  WarpSpecialization/WSLowerToken.cpp
   WarpSpecialization/WSSpecialize.cpp
   WarpSpecialization/WSTaskIdPropagate.cpp
   WarpSpecialization/WSTaskPartition.cpp
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization.cpp
@@ -3,6 +3,7 @@
 #include "mlir/Transforms/Passes.h"
 #include "nvidia/hopper/include/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 
 #define DEBUG_TYPE "nvgpu-warp-specialization"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
@@ -11,7 +12,10 @@
 namespace mlir {
 
 void doTaskPartition(triton::FuncOp &funcOp, unsigned numWarpGroups);
+int doTaskIdPropagate(triton::FuncOp &funcOp);
 bool doDataPartition(triton::FuncOp &funcOp, unsigned numConsumerGroups);
+void doCodePartition(triton::FuncOp &funcOp, unsigned numBuffers);
+void doTokenLowering(triton::FuncOp &funcOp, unsigned numConsumerGroups);
 
 #define GEN_PASS_DEF_NVGPUWARPSPECIALIZATION
 #include "nvidia/hopper/include/Transforms/Passes.h.inc"
@@ -23,15 +27,81 @@ class NVGPUWarpSpecializationPass
       NVGPUWarpSpecializationPass>::NVGPUWarpSpecializationBase;
 
   void runOnFuncOp(triton::FuncOp funcOp) {
-    if (numWarpGroups <= 1)
+    SmallVector<scf::ForOp> loops;
+    funcOp->walk([&](scf::ForOp forOp) {
+      if (forOp->hasAttr(mlir::triton::kWarpSpecializeAttrName))
+        loops.push_back(forOp);
+    });
+    if (loops.empty())
       return;
 
-    // Partition key ops into multiple async tasks.
-    doTaskPartition(funcOp, numWarpGroups);
+    int numWarps = mlir::triton::gpu::lookupNumWarps(funcOp);
+    if (numWarps != 4)
+      return;
+
+    // FIXME: skip warpspec if there is else block. Need to improve
+    // CodePartitioning to correctly handle channels in else block.
+    bool hasElse = false;
+    funcOp->walk([&](scf::IfOp ifOp) {
+      if (ifOp.elseBlock()) {
+        for (Operation &op : ifOp.elseBlock()->getOperations()) {
+          hasElse = true;
+        }
+      }
+    });
+    if (hasElse)
+      return;
 
-    // Partition ops into parallel sub ops.
-    if (!doDataPartition(funcOp, numWarpGroups - 1))
+    OpBuilder builder(funcOp);
+    auto moduleOp = funcOp->getParentOfType<ModuleOp>();
+    unsigned numWarpGroups = 3;
+    // FIXME: skip data partitioning with on-host TMA.
+    bool success = false;
+    for (; numWarpGroups >= 2; numWarpGroups--) {
+      // Partition key ops into multiple async tasks.
+      doTaskPartition(funcOp, numWarpGroups);
+      if (dumpIntermediateSteps) {
+        llvm::dbgs()
+            << "// -----// WarpSpec internal IR Dump After: doTaskPartition\n"
+            << moduleOp << "\n\n\n";
+      }
+      // Propagate taskId.
+      int retCode = doTaskIdPropagate(funcOp);
+      if (retCode == -1)
+        continue;
+      if (dumpIntermediateSteps) {
+        llvm::dbgs()
+            << "// -----// WarpSpec internal IR Dump After: doTaskIdPropagate\n"
+            << moduleOp << "\n\n\n";
+      }
+
+      // Partition ops into parallel sub ops.
+      if (doDataPartition(funcOp, numWarpGroups - 1)) {
+        if (dumpIntermediateSteps) {
+          llvm::dbgs()
+              << "// -----// WarpSpec internal IR Dump After: doDataPartition\n"
+              << moduleOp << "\n\n\n";
+        }
+        success = true;
+        break;
+      }
+      // Clear async_task.
+    }
+    if (!success)
       signalPassFailure();
+
+    doCodePartition(funcOp, numStages);
+    if (dumpIntermediateSteps) {
+      llvm::dbgs()
+          << "// -----// WarpSpec internal IR Dump After: doCodePartition\n"
+          << moduleOp << "\n\n\n";
+    }
+    doTokenLowering(funcOp, numWarpGroups - 1);
+    // Clear num_stages to disable SWP.
+    funcOp->walk([&](scf::ForOp forOp) {
+      forOp->setAttr(mlir::triton::kNumStagesAttrName,
+                     builder.getI32IntegerAttr(0));
+    });
   }
 
   void runOnOperation() override {
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/TaskIdPropagation.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/TaskIdPropagation.cpp
@@ -75,6 +75,16 @@ void TaskIdBackwardPropagation::propagateToYield(
   }
 }
 
+void TaskIdBackwardPropagation::propagateToTerminator(
+    Operation *op, ArrayRef<const TaskIdLattice *> &lattices) {
+  for (auto [lattice, terminatorOperand] :
+       llvm::zip_equal(lattices, op->getOperands())) {
+    auto terminatorLattice = getLatticeElement(terminatorOperand);
+    ChangeResult changed = terminatorLattice->meet(lattice->getValue());
+    propagateIfChanged(terminatorLattice, changed);
+  }
+}
+
 void TaskIdBackwardPropagation::propagateToParent(Operation *op,
                                                   const TaskId &taskId) {
   auto parentOp = op->getParentOp();
@@ -93,7 +103,7 @@ void TaskIdBackwardPropagation::propagateToParent(Operation *op,
       ChangeResult changed = condLattice->meet(taskId);
       propagateIfChanged(condLattice, changed);
     } else {
-      if (!isa<triton::FuncOp>(parentOp))
+      if (!isa<triton::FuncOp, triton::ReduceOp>(parentOp))
         llvm_unreachable("Other parent ops are not supported.");
     }
     parentOp = parentOp->getParentOp();
@@ -115,6 +125,14 @@ LogicalResult TaskIdBackwardPropagation::visitOperation(
     }
     // Propagate to the parent ops such as control flows
     propagateToParent(op, annotated);
+
+    if (op->getNumRegions() == 1) {
+      if (auto reduceOp = dyn_cast<triton::ReduceOp>(op)) {
+        propagateToTerminator(reduceOp.getCombineOp().front().getTerminator(),
+                              results);
+      }
+    }
+
     return success();
   }
   // If it is not annotated by the user, propagate from results to the
@@ -129,6 +147,13 @@ LogicalResult TaskIdBackwardPropagation::visitOperation(
   for (const auto resultLattice : results)
     propagateToParent(op, resultLattice->getValue());
 
+  if (op->getNumRegions() == 1) {
+    if (auto reduceOp = dyn_cast<triton::ReduceOp>(op)) {
+      propagateToTerminator(reduceOp.getCombineOp().front().getTerminator(),
+                            results);
+    }
+  }
+
   return success();
 }
 
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/TaskIdPropagation.h b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/TaskIdPropagation.h
@@ -91,6 +91,9 @@ class TaskIdBackwardPropagation
 
   void propagateToYield(scf::YieldOp yieldOp, SmallVector<TaskId> &lattices);
 
+  void propagateToTerminator(Operation *op,
+                             ArrayRef<const TaskIdLattice *> &lattices);
+
   void propagateToParent(Operation *op, const TaskId &taskId);
 };
 
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp
@@ -1178,8 +1178,7 @@ void foldLocalLoads(triton::FuncOp funcOp) {
                                               kv.getSecond());
 }
 
-void doCodePartition(triton::FuncOp &funcOp, unsigned numBuffers,
-                     unsigned requestedRegisters) {
+void doCodePartition(triton::FuncOp &funcOp, unsigned numBuffers) {
   // Step 1: collect all communications between producers and consumers.
   SmallVector<std::unique_ptr<Channel>> channelsOrigin;
   collectAsyncChannels(channelsOrigin, funcOp, numBuffers);
@@ -1269,7 +1268,7 @@ void doCodePartition(triton::FuncOp &funcOp, unsigned numBuffers,
     funcOp.dump();
   });
 
-  specializeRegion(funcOp, requestedRegisters);
+  specializeRegion(funcOp, 0 /*requestedRegisters*/);
   LLVM_DEBUG({
     LDBG("\n\nwith specializeRegion");
     funcOp.dump();
@@ -1288,7 +1287,7 @@ class NVGPUTestWSCodePartitionPass
   void runOnFuncOp(triton::FuncOp funcOp) {
     // Disable code partitioning when numBuffers is 0.
     if (numBuffers > 0)
-      doCodePartition(funcOp, numBuffers, requestedRegisters);
+      doCodePartition(funcOp, numBuffers);
   }
   void runOnOperation() override {
     getOperation()->walk([&](triton::FuncOp funcOp) { runOnFuncOp(funcOp); });
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp
@@ -47,6 +47,9 @@ static void fixTaskId(triton::FuncOp &funcOp) {
         auto defTaskIds = getAsyncTaskIds(defOp);
         // Backward propagation: ensure def covers op's task IDs.
         if (!containsAll(defTaskIds, asyncTaskIds)) {
+          // Skip control flow ops.
+          if (isa<scf::YieldOp, scf::ForOp, scf::IfOp>(op))
+            continue;
           // Only propagate backward to arithmetic ops (e.g. constants).
           // Const ops with same value but different task ids can be folded.
           if (defOp->getDialect()->getNamespace() == "arith") {
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSLowerToken.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSLowerToken.cpp
diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskIdPropagate.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskIdPropagate.cpp
diff --git a/third_party/nvidia/triton_nvidia.cc b/third_party/nvidia/triton_nvidia.cc