
Commit ffc614d

[Hopper][WS] Add TaskIdPropagate pass (#7038)
This change adds a pass that propagates `async_task_id` annotations from anchor ops to their dependencies using a sparse backward dataflow analysis, effectively partitioning the IR into multiple async tasks based on the initial annotations.
1 parent 859dcf0 commit ffc614d

File tree: 8 files changed, +470 −5 lines changed
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
// RUN: triton-opt %s -split-input-file --nvgpu-test-taskid-propagate=num-warp-groups=2 | FileCheck %s

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 16]}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 0}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {

  // CHECK-LABEL: @matmul_persistent_tma_ws_cooperative_kernel
  // CHECK: %[[C0:.*]] = arith.constant {async_task_id = array<i32: 0, 1, 2>} 0 : i32
  // CHECK-NEXT: %[[C1:.*]] = arith.constant {async_task_id = array<i32: 0, 1, 2>} 1 : i32
  // CHECK-NEXT: %[[C64:.*]] = arith.constant {async_task_id = array<i32: 0>} 64 : i32
  // CHECK-NEXT: %[[INIT:.*]] = arith.constant {async_task_id = array<i32: 1, 2>} dense<0.000000e+00> : tensor<128x256xf32, #mma>
  // CHECK-NEXT: %[[PID:.*]] = tt.get_program_id x {async_task_id = array<i32: 0, 1, 2>} : i32
  // CHECK-NEXT: %[[NUM:.*]] = tt.get_num_programs x {async_task_id = array<i32: 0, 1, 2>} : i32
  // CHECK-NEXT: scf.for %[[IV:.*]] = %[[PID]] to %[[UB:.*]] step %[[NUM]] : i32 {
  // CHECK-NEXT: %[[FOR:.*]]:2 = scf.for %{{.*}} = %[[C0]] to %{{.*}} step %[[C1]] iter_args(%[[ACC:.*]] = %[[INIT]], %[[OFF:.*]] = %[[C0]])
  // CHECK-NEXT: %[[LOAD1:.*]] = tt.descriptor_load %[[INPUT1:.*]][%[[IV]], %[[OFF]]] {async_task_id = array<i32: 0>}
  // CHECK-NEXT: %[[ALLOC1:.*]] = ttg.local_alloc %[[LOAD1]] {async_task_id = array<i32: 1, 2>}
  // CHECK-NEXT: %[[LOAD2:.*]] = tt.descriptor_load %[[INPUT2:.*]][%[[OFF]], %[[IV]]] {async_task_id = array<i32: 0>}
  // CHECK-NEXT: %[[ALLOC2:.*]] = ttg.local_alloc %[[LOAD2]] {async_task_id = array<i32: 1, 2>}
  // CHECK-NEXT: %[[DOT:.*]] = ttng.warp_group_dot %[[ALLOC1]], %[[ALLOC2]], %[[ACC]] {async_task_id = array<i32: 1, 2>, inputPrecision = 0 : i32}
  // CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[OFF]], %[[C64]] {async_task_id = array<i32: 0>}
  // CHECK-NEXT: scf.yield {async_task_id = array<i32: 0, 1, 2>} %[[DOT]], %[[ADD]]
  // CHECK-NEXT: } {async_task_id = array<i32: 0, 1, 2>}
  // CHECK-NEXT: arith.truncf %[[FOR]]#0 {async_task_id = array<i32: 1, 2>}
  // CHECK-NEXT: ttg.convert_layout %{{.*}} {async_task_id = array<i32: 1, 2>}
  // CHECK-NEXT: tt.descriptor_store %[[OUTPUT:.*]][%[[IV]], %[[IV]]], %{{.*}} {async_task_id = array<i32: 1, 2>}
  // CHECK-NEXT: } {async_task_id = array<i32: 0, 1, 2>}

  tt.func public @matmul_persistent_tma_ws_cooperative_kernel(%arg0: !tt.tensordesc<tensor<128x64xf16>>, %arg1: !tt.tensordesc<tensor<64x256xf16>>, %arg2: !tt.tensordesc<tensor<128x256xf16>>, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %c0_i32 = arith.constant 0 : i32
    %c1_i32 = arith.constant 1 : i32
    %c64_i32 = arith.constant 64 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
    %0 = tt.get_program_id x : i32
    %1 = tt.get_num_programs x : i32
    scf.for %arg6 = %0 to %arg3 step %1 : i32 {
      %2:2 = scf.for %arg7 = %c0_i32 to %arg5 step %c1_i32 iter_args(%arg8 = %cst, %arg9 = %c0_i32) -> (tensor<128x256xf32, #mma>, i32) : i32 {
        %5 = tt.descriptor_load %arg0[%arg6, %arg9] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<128x64xf16>> -> tensor<128x64xf16, #blocked>
        %6 = ttg.local_alloc %5 : (tensor<128x64xf16, #blocked>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
        %7 = tt.descriptor_load %arg1[%arg9, %arg6] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<64x256xf16>> -> tensor<64x256xf16, #blocked1>
        %8 = ttg.local_alloc %7 : (tensor<64x256xf16, #blocked1>) -> !ttg.memdesc<64x256xf16, #shared, #smem>
        %9 = ttng.warp_group_dot %6, %8, %arg8 {async_task_id = array<i32: 1, 2>, inputPrecision = 0 : i32} : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x256xf16, #shared, #smem> -> tensor<128x256xf32, #mma>
        %10 = arith.addi %arg9, %c64_i32 : i32
        scf.yield %9, %10 : tensor<128x256xf32, #mma>, i32
      }
      %3 = arith.truncf %2#0 : tensor<128x256xf32, #mma> to tensor<128x256xf16, #mma>
      %4 = ttg.convert_layout %3 : tensor<128x256xf16, #mma> -> tensor<128x256xf16, #blocked1>
      tt.descriptor_store %arg2[%arg6, %arg6], %4 {async_task_id = array<i32: 1, 2>} : !tt.tensordesc<tensor<128x256xf16>>, tensor<128x256xf16, #blocked1>
    }
    tt.return
  }
}
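
The annotation the CHECK lines match is an ordinary `DenseI32ArrayAttr` named `async_task_id`. As a point of reference, here is a minimal, hypothetical sketch of reading and writing that attribute from C++; the repository's real helpers (e.g. the `getAsyncTaskIds` mentioned in a TODO below) live in the WarpSpecialization utilities and may differ:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SmallVector.h"

// Hypothetical helpers, for illustration only.
static llvm::SmallVector<int32_t> readTaskIds(mlir::Operation *op) {
  if (auto attr = op->getAttrOfType<mlir::DenseI32ArrayAttr>("async_task_id"))
    return {attr.asArrayRef().begin(), attr.asArrayRef().end()};
  return {};
}

static void writeTaskIds(mlir::Operation *op, llvm::ArrayRef<int32_t> ids) {
  op->setAttr("async_task_id",
              mlir::DenseI32ArrayAttr::get(op->getContext(), ids));
}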

third_party/nvidia/hopper/include/Transforms/Passes.td

Lines changed: 19 additions & 1 deletion
@@ -4,7 +4,7 @@
 include "mlir/Pass/PassBase.td"

 def NVGPUWarpSpecialization : Pass<"nvgpu-warp-specialization", "mlir::ModuleOp"> {
-  let summary = "Automaticl Warp specialization for NVIDIA GPU";
+  let summary = "Automatic Warp specialization for NVIDIA GPU";

   let description = [{
     This pass automatically partitions user-defined kernels into
@@ -33,6 +33,24 @@ def NVGPUTestWSTaskPartition : Pass<"nvgpu-test-ws-task-partition", "mlir::Modul
   ];
 }

+def NVGPUTestWSTaskIdPropagate : Pass<"nvgpu-test-taskid-propagate", "mlir::ModuleOp"> {
+  let summary = "test warp specialization task id propagation";
+
+  let description = [{
+    This pass propagates the `async_task_id` annotation to the dependencies
+    of any op that has it set. This has the functional effect of partitioning
+    the graph into multiple async tasks, based on the initial annotation.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+
+  let options = [
+    Option<"numWarpGroups", "num-warp-groups",
+           "int32_t", /*default*/"0",
+           "number of warp groups for warp specialization">
+  ];
+}
+
 def NVGPUTestWSDataPartition : Pass<"nvgpu-test-ws-data-partition", "mlir::ModuleOp"> {
   let summary = "test warp specialization data partition";

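The TableGen `Option` above becomes a `numWarpGroups` member on the generated pass class. A rough sketch of how the test-pass driver might consume it, assuming MLIR's usual GEN_PASS_DEF pattern (the actual driver, WSTaskIdPropagate.cpp, is listed in the CMake diff below but not reproduced on this page; class and base names here are assumptions):

// Sketch only: base-class name assumed from the TableGen def.
namespace {
struct NVGPUTestWSTaskIdPropagatePass
    : public impl::NVGPUTestWSTaskIdPropagateBase<
          NVGPUTestWSTaskIdPropagatePass> {
  void runOnOperation() override {
    mlir::ModuleOp module = getOperation();
    // `numWarpGroups` is generated from the "num-warp-groups" option
    // (default 0).
    if (numWarpGroups == 0)
      return;
    // ... seed anchor ops and run the backward propagation over `module` ...
  }
};
} // namespace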
third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,14 @@
 add_triton_library(NVHopperTransforms
   WarpSpecialization.cpp
   WarpSpecialization/CodePartitionUtility.cpp
+  WarpSpecialization/TaskIdPropagation.cpp
+  WarpSpecialization/Utility.cpp
   WarpSpecialization/WSBuffer.cpp
   WarpSpecialization/WSCodePartition.cpp
+  WarpSpecialization/WSDataPartition.cpp
   WarpSpecialization/WSLowerMem.cpp
   WarpSpecialization/WSSpecialize.cpp
-  WarpSpecialization/Utility.cpp
-  WarpSpecialization/WSDataPartition.cpp
+  WarpSpecialization/WSTaskIdPropagate.cpp
   WarpSpecialization/WSTaskPartition.cpp

   DEPENDS
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
#include "TaskIdPropagation.h"
#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Support/LLVM.h"
#include "nvidia/hopper/lib/Transforms/WarpSpecialization/Utility.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "task-id-propagation"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

using namespace mlir;
using namespace mlir::dataflow;

namespace mlir::triton::gpu {

//===----------------------------------------------------------------------===//
// TaskId
//===----------------------------------------------------------------------===//

void TaskId::print(raw_ostream &os) const {
  if (isUninitialized()) {
    os << "<UNINITIALIZED>";
    return;
  }
  if (isUnknown()) {
    os << "<UNKNOWN>";
    return;
  }
  getTaskIds().print(os);
}

TaskId TaskId::join(const TaskId &lhs, const TaskId &rhs) {
  // Only the meet direction is meaningful for this backward analysis.
  return TaskId::getUnknownTaskId();
}

TaskId TaskId::meet(const TaskId &lhs, const TaskId &rhs) {
  if (lhs.isUnknown() || rhs.isUnknown())
    return TaskId::getUnknownTaskId();
  if (lhs.isUninitialized())
    return rhs;
  if (rhs.isUninitialized())
    return lhs;
  if (lhs == rhs)
    return lhs;

  auto context = lhs.getTaskIds().getContext();
  auto lhsTasks = lhs.getTaskIds().asArrayRef();
  auto rhsTasks = rhs.getTaskIds().asArrayRef();
  // Meet the task ids by merging and deduplicating them.
  SmallVector<AsyncTaskId> result(lhsTasks.begin(), lhsTasks.end());
  result.insert(result.end(), rhsTasks.begin(), rhsTasks.end());
  llvm::sort(result);
  result.erase(std::unique(result.begin(), result.end()), result.end());
  return TaskId(DenseI32ArrayAttr::get(context, ArrayRef<AsyncTaskId>(result)));
}
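
// For example, meet({0}, {1, 2}) = {0, 1, 2}: a value reachable from anchors
// in several async tasks must end up visible to all of them.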

//===----------------------------------------------------------------------===//
// TaskIdBackwardPropagation
//===----------------------------------------------------------------------===//

void TaskIdBackwardPropagation::propagateToYield(
    scf::YieldOp yieldOp, SmallVector<TaskId> &lattices) {
  for (auto [lattice, yieldOperand] :
       llvm::zip_equal(lattices, yieldOp->getOperands())) {
    auto yieldLattice = getLatticeElement(yieldOperand);
    ChangeResult changed = yieldLattice->meet(lattice);
    propagateIfChanged(yieldLattice, changed);
  }
}
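
// An annotated op nested in control flow also constrains its enclosing
// regions: e.g. a ttng.warp_group_dot tagged {1, 2} inside an scf.for forces
// the loop's control operands (lower bound, upper bound, step) and any
// enclosing scf.if condition to carry at least {1, 2}, walking up to the
// enclosing tt.func.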
void TaskIdBackwardPropagation::propagateToParent(Operation *op,
                                                  const TaskId &taskId) {
  auto parentOp = op->getParentOp();
  while (parentOp && !isa<triton::FuncOp>(parentOp)) {
    if (auto forOp = dyn_cast<scf::ForOp>(parentOp)) {
      // Propagate to the control operands of the for op.
      for (auto controlOperand :
           forOp.getOperands().take_front(forOp.getNumControlOperands())) {
        auto controlLattice = getLatticeElement(controlOperand);
        ChangeResult changed = controlLattice->meet(taskId);
        propagateIfChanged(controlLattice, changed);
      }
    } else if (auto ifOp = dyn_cast<scf::IfOp>(parentOp)) {
      auto cond = ifOp.getCondition();
      auto condLattice = getLatticeElement(cond);
      ChangeResult changed = condLattice->meet(taskId);
      propagateIfChanged(condLattice, changed);
    } else {
      // The loop condition already excludes triton::FuncOp.
      llvm_unreachable("Other parent ops are not supported.");
    }
    parentOp = parentOp->getParentOp();
  }
}

LogicalResult TaskIdBackwardPropagation::visitOperation(
    Operation *op, ArrayRef<TaskIdLattice *> operands,
    ArrayRef<const TaskIdLattice *> results) {
  // Already annotated: treat the op as an anchor and meet its task ids into
  // every operand.
  // TODO(Arda): Replace the following with getAsyncTaskIds when we no longer
  // need to dump the task ids into the IR.
  auto taskIdAttr = op->getAttrOfType<DenseI32ArrayAttr>("async_task_id");
  if (taskIdAttr) {
    const auto annotated = TaskId(taskIdAttr);
    for (auto operandLattice : operands) {
      ChangeResult changed = operandLattice->meet(annotated);
      propagateIfChanged(operandLattice, changed);
    }
    // Propagate to parent ops such as enclosing control flow.
    propagateToParent(op, annotated);
    return success();
  }
  // If the op is not annotated by the user, propagate from its results to its
  // operands.
  for (const auto resultLattice : results) {
    for (auto operandLattice : operands) {
      ChangeResult changed = operandLattice->meet(resultLattice->getValue());
      propagateIfChanged(operandLattice, changed);
    }
  }

  for (const auto resultLattice : results)
    propagateToParent(op, resultLattice->getValue());

  return success();
}

void TaskIdBackwardPropagation::visitBranchOperand(OpOperand &operand) {
  auto defOp = operand.getOwner();
  assert(isa<scf::IfOp>(defOp) || isa<scf::ForOp>(defOp));

  SmallVector<TaskId> lattices(defOp->getNumResults(),
                               TaskId::getUninitialized());
  for (auto [i, result] : llvm::enumerate(defOp->getResults())) {
    auto resultLattice = getLatticeElement(result);
    // Wait for all the results to be initialized.
    if (resultLattice->getValue().isUninitialized())
      return;
    lattices[i] = TaskId::meet(lattices[i], resultLattice->getValue());
  }

  // Propagate to the yield ops.
  if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
    auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
    propagateToYield(yieldOp, lattices);
  } else if (auto ifOp = dyn_cast<scf::IfOp>(defOp)) {
    propagateToYield(ifOp.thenYield(), lattices);
    if (!ifOp.getElseRegion().empty())
      propagateToYield(ifOp.elseYield(), lattices);
  } else {
    llvm_unreachable("Unknown branch operation");
  }

  // TODO(Arda): Address what happens when loop is annotated.
}

void TaskIdBackwardPropagation::visitCallOperand(OpOperand &operand) {
  llvm_unreachable(
      "Should not have any call operands in the IR after inlining.");
}

// No exit-state constraint: task ids are seeded only by annotated anchor ops.
void TaskIdBackwardPropagation::setToExitState(TaskIdLattice *lattice) {}

} // namespace mlir::triton::gpu
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
#ifndef NVHOPPER_ANALYSIS_TASKIDPROPAGATION_H
#define NVHOPPER_ANALYSIS_TASKIDPROPAGATION_H

#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include <optional>

using namespace mlir::dataflow;

namespace mlir::triton::gpu {

//===----------------------------------------------------------------------===//
// TaskId
//===----------------------------------------------------------------------===//

/// This lattice value represents the known information about the
/// async_task_id set of an SSA value.
class TaskId {
public:
  /// Construct a taskId value as uninitialized.
  explicit TaskId() = default;

  /// Construct a taskId value with a known set of task ids.
  TaskId(DenseI32ArrayAttr taskIds) : taskIds(std::move(taskIds)) {}

  /// Get the task ids. Returns null if the state is unknown.
  DenseI32ArrayAttr getTaskIds() const {
    assert(!isUninitialized());
    return *taskIds;
  }

  /// Compare the taskId values.
  bool operator==(const TaskId &rhs) const { return taskIds == rhs.taskIds; }

  /// Print the taskId value.
  void print(raw_ostream &os) const;

  /// The state where the taskIds value is uninitialized. This happens when the
  /// state hasn't been set during the analysis.
  static TaskId getUninitialized() { return TaskId{}; }

  /// Whether the state is uninitialized.
  bool isUninitialized() const { return !taskIds.has_value(); }

  /// Whether the state is unknown.
  bool isUnknown() const { return taskIds == nullptr; }

  /// The state where the taskId value is unknown.
  static TaskId getUnknownTaskId() { return TaskId{/*taskIds=*/nullptr}; }

  static TaskId meet(const TaskId &lhs, const TaskId &rhs);

  static TaskId join(const TaskId &lhs, const TaskId &rhs);

private:
  std::optional<DenseI32ArrayAttr> taskIds;
};
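
// State encoding recap: an empty optional is <UNINITIALIZED> (not yet
// visited), an engaged optional holding a null attribute is <UNKNOWN>, and a
// non-null DenseI32ArrayAttr is a concrete set of task ids.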

//===----------------------------------------------------------------------===//
// TaskIdLattice
//===----------------------------------------------------------------------===//

class TaskIdLattice : public Lattice<TaskId> {
public:
  using Lattice::Lattice;
};

//===----------------------------------------------------------------------===//
// TaskIdBackwardPropagation
//===----------------------------------------------------------------------===//

/// This analysis implements sparse backward propagation, which attempts to
/// determine the async_task_id of an SSA value.
class TaskIdBackwardPropagation
    : public SparseBackwardDataFlowAnalysis<TaskIdLattice> {
public:
  using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;

  LogicalResult
  visitOperation(Operation *op, ArrayRef<TaskIdLattice *> operands,
                 ArrayRef<const TaskIdLattice *> results) override;

  void visitBranchOperand(OpOperand &operand) override;

  void visitCallOperand(OpOperand &operand) override;

  void setToExitState(TaskIdLattice *lattice) override;

  void propagateToYield(scf::YieldOp yieldOp, SmallVector<TaskId> &lattices);

  void propagateToParent(Operation *op, const TaskId &taskId);
};

} // namespace mlir::triton::gpu

#endif // NVHOPPER_ANALYSIS_TASKIDPROPAGATION_H
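
For context, a hedged sketch of how a client pass might drive this analysis with MLIR's dataflow framework. The concrete driver (WSTaskIdPropagate.cpp) is not reproduced on this page; the auxiliary analyses loaded below follow the framework's usual prerequisites for sparse analyses rather than anything stated in this diff:

#include "TaskIdPropagation.h"
#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/SymbolTable.h"

// Sketch: run the backward task-id propagation and query each value's lattice.
mlir::LogicalResult runTaskIdAnalysis(mlir::ModuleOp module) {
  mlir::SymbolTableCollection symbolTable;
  mlir::DataFlowSolver solver;
  // Dead-code and constant analyses keep the sparse framework's liveness
  // information populated.
  solver.load<mlir::dataflow::DeadCodeAnalysis>();
  solver.load<mlir::dataflow::SparseConstantPropagation>();
  solver.load<mlir::triton::gpu::TaskIdBackwardPropagation>(symbolTable);
  if (failed(solver.initializeAndRun(module)))
    return mlir::failure();

  // Query the computed lattice for each SSA value, e.g. to re-emit
  // async_task_id attributes into the IR as the test pass does.
  module.walk([&](mlir::Operation *op) {
    for (mlir::Value result : op->getResults()) {
      if (auto *lattice =
              solver.lookupState<mlir::triton::gpu::TaskIdLattice>(result))
        (void)lattice; // ... materialize lattice->getValue() ...
    }
  });
  return mlir::success();
}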
