Skip to content

Commit 0f1e09e

Browse files
authored
[WarpSpec] Implementation of code partitioning (#6746)
The main flow is in WSCodePartition.cpp: - Collect all communication channels between producers and consumers. - ProducerOp reordering - Dataflow multi-buffering: correctly generate bufferIdx and phase for the communication ops (WSBuffer.cpp) - Lowering loads to asynchronous (WSLowerMem.cpp) - Insert communication ops - Separate the function into partitions of warp_specialize according to attributes on ops (WSSpecialize.cpp)
1 parent 11b6747 commit 0f1e09e

File tree

10 files changed

+3426
-1
lines changed

10 files changed

+3426
-1
lines changed

test/Hopper/WarpSpecialization/ws_code_partition.mlir

Lines changed: 262 additions & 0 deletions
Large diffs are not rendered by default.

third_party/nvidia/hopper/include/Transforms/Passes.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,23 @@ def NVGPUTestWSDataPartition : Pass<"nvgpu-test-ws-data-partition", "mlir::Modul
4646
];
4747
}
4848

49+
def NVGPUTestWSCodePartition: Pass<"nvgpu-test-ws-code-partition", "mlir::ModuleOp"> {
  let summary = "test warp specialization code partition";

  // NOTE: fixed typo "baed" -> "based" in the user-visible description.
  let description = "This pass generates warp specialized code based on task id attributes.";

  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
                           "mlir::triton::TritonDialect",
                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                           "mlir::triton::nvws::NVWSDialect"];
  let options = [
    Option<"numBuffers", "num-buffers",
           "int32_t", /*default*/"0",
           "number of buffering for producer-consumer">,
    Option<"numWarpGroups", "num-warp-groups",
           "int32_t", /*default*/"0",
           "number of warp groups for warp specialization">
  ];
}
67+
4968
#endif // NV_TRANSFORMS_PASSES

third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
add_triton_library(NVHopperTransforms
22
WarpSpecialization.cpp
3+
WarpSpecialization/CodePartitionUtility.cpp
4+
WarpSpecialization/WSBuffer.cpp
5+
WarpSpecialization/WSCodePartition.cpp
6+
WarpSpecialization/WSLowerMem.cpp
7+
WarpSpecialization/WSSpecialize.cpp
38
WarpSpecialization/Utility.cpp
49
WarpSpecialization/WSDataPartition.cpp
510
WarpSpecialization/WSTaskPartition.cpp
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#include "CodePartitionUtility.h"
2+
#include "mlir/Analysis/SliceAnalysis.h"
3+
#include "mlir/Pass/Pass.h"
4+
#include "mlir/Pass/PassManager.h"
5+
#include "mlir/Transforms/Passes.h"
6+
#include "nvidia/hopper/include/Transforms/Passes.h"
7+
#include <list>
8+
#include <unordered_set>
9+
10+
namespace tt = mlir::triton;
11+
namespace ttg = mlir::triton::gpu;
12+
namespace ttng = ::mlir::triton::nvidia_gpu;
13+
namespace mlir {
14+
15+
#define DEBUG_TYPE "nvgpu-ws-utility"
16+
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
17+
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
18+
19+
// Check to see if op is enclosed under ifOp.
20+
bool enclosing(scf::IfOp ifOp, Operation *op) {
21+
return ifOp->isProperAncestor(op);
22+
}
23+
24+
// Returns true when `op` lies strictly inside the body of `forOp`,
// i.e. forOp is a proper ancestor of op.
bool enclosing(scf::ForOp forOp, Operation *op) {
  return forOp->isProperAncestor(op);
}
27+
28+
// Check to see if there is no outer loop that is enclosed under ifOp.
29+
bool immediateEnclosing(scf::IfOp ifOp, Operation *subOp) {
30+
auto pOp = subOp->getParentOfType<scf::ForOp>();
31+
if (!pOp)
32+
return true;
33+
return !enclosing(ifOp, pOp.getOperation());
34+
}
35+
36+
// Return number of AccumCnts for the given ctrlOp. We need one for each nested
37+
// region that contains a channel.
38+
unsigned getAccumCnts(Operation *ctrlOp,
39+
const DenseSet<Operation *> &regionsWithChannels) {
40+
unsigned cnt = 0;
41+
LDBG("getAccumCnts: " << ctrlOp);
42+
for (auto *op : regionsWithChannels) {
43+
LDBG("-- getAccumCnts: " << ctrlOp << " regionsWithChannels " << op);
44+
if (ctrlOp == op) {
45+
++cnt;
46+
continue;
47+
}
48+
if (auto forOp = dyn_cast<scf::ForOp>(ctrlOp)) {
49+
if (enclosing(forOp, op))
50+
++cnt;
51+
continue;
52+
}
53+
if (auto ifOp = dyn_cast<scf::IfOp>(ctrlOp)) {
54+
if (enclosing(ifOp, op))
55+
++cnt;
56+
continue;
57+
}
58+
llvm_unreachable("region op other than If/For is not supported");
59+
}
60+
return cnt;
61+
}
62+
63+
// Assume parentForOp carries an accumCnt for the specified ctrlOp. The
// argument index of ctrlOp's counter is the number of channel-carrying
// regions visited before ctrlOp in a preorder walk of parentForOp.
unsigned getAccumArgIdx(scf::ForOp parentForOp, Operation *ctrlOp,
                        const DenseSet<Operation *> &regionsWithChannels) {
  unsigned regionsSeen = 0, argIdx = 0;
  bool located = false;
  // Preorder walk visits parentForOp itself as well as all nested ops.
  parentForOp->walk<WalkOrder::PreOrder>([&](Operation *subOp) {
    if (subOp == ctrlOp) {
      argIdx = regionsSeen;
      located = true;
    }
    if (regionsWithChannels.contains(subOp)) {
      LDBG("getAccumArgIdx: saw ctrlOp enclosing channel " << subOp);
      ++regionsSeen;
    }
  });
  assert(located && "error in getAccumArgIdx");
  LDBG("getAccumArgIdx: " << parentForOp.getOperation() << " " << ctrlOp << " "
                          << argIdx);
  return argIdx;
}
87+
88+
// Compute and return the buffer index and phase for a given accumulate
// count:
//   bufferIdx = accumCnt - (accumCnt / numBuffers) * numBuffers
//   phase     = (accumCnt / numBuffers) & 1
std::pair<Value, Value> getBufferIdxAndPhase(OpBuilderWithAsyncTaskIds &builder,
                                             Location loc, Value accumCnt,
                                             unsigned numBuffers) {
  // Materialize numBuffers as an i64 so the arithmetic matches accumCnt.
  Value numBuffersVal =
      builder.createWithAsyncTaskIds<arith::ConstantIntOp>(loc, numBuffers, 32);
  numBuffersVal = builder.createWithAsyncTaskIds<arith::ExtSIOp>(
      loc, builder.getI64Type(), numBuffersVal);

  // quotient = accumCnt / numBuffers (unsigned division).
  Value quotient = builder.createWithAsyncTaskIds<arith::DivUIOp>(
      loc, accumCnt, numBuffersVal);

  // bufferIdx = accumCnt - quotient * numBuffers, narrowed to i32.
  Value rounded = builder.createWithAsyncTaskIds<arith::MulIOp>(loc, quotient,
                                                                numBuffersVal);
  Value bufferIdx =
      builder.createWithAsyncTaskIds<arith::SubIOp>(loc, accumCnt, rounded);
  bufferIdx = builder.createWithAsyncTaskIds<arith::TruncIOp>(
      loc, builder.getI32Type(), bufferIdx);

  // phase = low bit of the quotient, narrowed to i1.
  Value one = builder.createWithAsyncTaskIds<arith::ConstantIntOp>(loc, 1, 64);
  Value phaseBit =
      builder.createWithAsyncTaskIds<arith::AndIOp>(loc, quotient, one);
  Value phase = builder.createWithAsyncTaskIds<arith::TruncIOp>(
      loc, builder.getI1Type(), phaseBit);

  return {bufferIdx, phase};
}
115+
116+
// Get the current accumulation count for the given op within its immediate
// scope.
// ForA (accumForA, accumIfA, accumForB, accumIfB)
//   IfA (accumIfA, accumForB)
//     Channel A --> uses ForA.arg[accumIfA]
//     ForB (accumForB)
//       Channel B --> uses ForB.arg[accumForB]
//     ThenYield ForA.arg[accumIfA] + 1, ForB.res[accumForB]
//     ElseYield ForA.arg[accumIfA], ForA.arg[accumForB]
//   ForC (accumForC, accumIfB)
//     IfB
//       Channel C --> uses ForC.arg[accumIfB]
//       ThenYield ForC.arg[accumIfB] + 1
//       ElseYield ForC.arg[accumIfB]
//   Channel D --> uses ForA.arg[accumForA]
Value getAccumCount(OpBuilderWithAsyncTaskIds &builder, Operation *op,
                    const DenseSet<Operation *> &regionsWithChannels) {
  auto parentForOp = op->getParentOfType<scf::ForOp>();
  // Guard the lookup: the code below unconditionally dereferences
  // parentForOp, which would crash if op has no enclosing scf.for.
  assert(parentForOp && "getAccumCount: op has no enclosing scf.for");
  auto *pOp = op->getParentOp();
  // The accum counters appear to be the trailing block arguments of the
  // parent loop body; accumArgId selects the one belonging to pOp.
  unsigned tSize = parentForOp.getBody()->getArguments().size();
  unsigned parentTCnts = getAccumCnts(parentForOp, regionsWithChannels);
  unsigned accumArgId = getAccumArgIdx(parentForOp, pOp, regionsWithChannels);
  Value accumCnt =
      parentForOp.getBody()->getArgument(tSize - parentTCnts + accumArgId);

  LDBG("getAccumCount: parentForOp " << parentForOp.getOperation() << " pOp "
                                     << pOp << " " << tSize << " "
                                     << parentTCnts << " " << accumArgId);
  return accumCnt;
}
147+
148+
// Convenience overload: resolve the accumulate count for `op`, then derive
// the buffer index and phase into the out-params.
void getBufferIdxAndPhase(OpBuilderWithAsyncTaskIds &builder, Operation *op,
                          unsigned numBuffers,
                          const DenseSet<Operation *> &regionsWithChannels,
                          Value &bufferIdx, Value &phase) {
  Value accumCnt = getAccumCount(builder, op, regionsWithChannels);
  auto idxAndPhase =
      getBufferIdxAndPhase(builder, op->getLoc(), accumCnt, numBuffers);
  bufferIdx = idxAndPhase.first;
  phase = idxAndPhase.second;
}
156+
157+
// Return a size-1 subview of `barrierAlloc` that selects the barrier slot
// for pipeline stage `bufferIdx`.
Value getBarrierForPipelineStage(OpBuilderWithAsyncTaskIds &builder,
                                 Value barrierAlloc, Value bufferIdx) {
  auto *context = barrierAlloc.getContext();
  auto allocTy = cast<ttg::MemDescType>(barrierAlloc.getType());
  Attribute sharedMemorySpace =
      triton::gpu::SharedMemorySpaceAttr::get(context);
  // Single i64 barrier slot, same encoding as the full allocation.
  ttg::MemDescType barrierTy =
      ttg::MemDescType::get({1}, builder.getI64Type(), allocTy.getEncoding(),
                            sharedMemorySpace,
                            /*mutableMemory=*/true);

  // Create barrierForTMA from barrierAlloc.
  return builder.createWithAsyncTaskIds<ttg::MemDescSubviewOp>(
      barrierAlloc.getLoc(), barrierTy, barrierAlloc,
      ArrayRef<Value>({bufferIdx}));
}
173+
174+
} // namespace mlir
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#ifndef NV_DIALECT_HOPPER_TRANSFORMS_CODEPARTITIONUTILITY_H_
2+
#define NV_DIALECT_HOPPER_TRANSFORMS_CODEPARTITIONUTILITY_H_
3+
4+
#include "triton/Dialect/Triton/IR/Dialect.h"
5+
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
6+
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
7+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
8+
9+
#include "Utility.h"
10+
#include <algorithm>
11+
#include <numeric>
12+
13+
namespace mlir {
14+
15+
namespace tt = mlir::triton;
16+
17+
enum class DataChannelKind { SMEM, TMEM };
18+
19+
struct Channel {
20+
public:
21+
using Relation = std::pair<int, SmallVector<int>>;
22+
23+
Channel(int producer, SmallVector<int> &consumers, Operation *op,
24+
unsigned operandIdx, unsigned numBuffers)
25+
: relation(producer, consumers), op(op), operandIdx(operandIdx),
26+
numBuffers(numBuffers) {}
27+
28+
bool operator==(const Channel &c) {
29+
return relation == c.relation && operandIdx == c.operandIdx && op == c.op;
30+
}
31+
32+
Operation *getDstOp() { return op; }
33+
unsigned getDstOperandIdx() { return operandIdx; }
34+
virtual Value getSrcOperand() { return op->getOperand(operandIdx); }
35+
virtual Operation *getSrcOp() { return getSrcOperand().getDefiningOp(); }
36+
37+
Relation relation; // producer task Id, a list of consumer task Ids
38+
Operation *op;
39+
unsigned operandIdx;
40+
unsigned numBuffers;
41+
DataChannelKind channelKind = DataChannelKind::SMEM;
42+
};
43+
44+
// Synchronization objects materialized for a producer/consumer pairing.
struct CommChannel {
  // Tokens keyed by int id — presumably the consumer task id; confirm
  // against the code that populates this map.
  DenseMap<int, Value> tokens;
  // Producer barrier is only needed when the producer op itself can update
  // the barrier inline, such as the TMA load.
  std::optional<Value> producerBarrier;
  // Consumer barrier is only needed when the consumer op itself can update
  // the barrier inline, such as the TCGen5MMAOp.
  DenseMap<int, Value> consumerBarriers;
};
53+
54+
namespace ttng = ::mlir::triton::nvidia_gpu;
55+
namespace triton {
56+
namespace nvidia_gpu {
57+
struct TmemDataChannel : Channel {
58+
ttng::TMEMAllocOp tmemAllocOp;
59+
ttng::TCGen5MMAOp tmemMmaOp;
60+
Operation *tmemProducerOp;
61+
62+
TmemDataChannel(int producer, SmallVector<int> &consumers,
63+
ttng::TMEMAllocOp tmemAllocOp, ttng::TCGen5MMAOp tmemMmaOp,
64+
Operation *tmemLoadOp, unsigned operandIdx,
65+
unsigned numBuffers)
66+
: Channel(producer, consumers, tmemLoadOp, operandIdx, numBuffers),
67+
tmemAllocOp(tmemAllocOp), tmemProducerOp(tmemAllocOp),
68+
tmemMmaOp(tmemMmaOp) {
69+
assert(consumers.size() == 1 &&
70+
"TmemDataChannel must have a single consumer");
71+
channelKind = DataChannelKind::TMEM;
72+
}
73+
74+
ttng::TMEMAllocOp getAllocOp() { return tmemAllocOp; }
75+
ttng::TCGen5MMAOp getMmaOp() { return tmemMmaOp; }
76+
virtual Operation *getSrcOp() { return tmemProducerOp; }
77+
};
78+
} // namespace nvidia_gpu
79+
} // namespace triton
80+
81+
bool enclosing(scf::IfOp ifOp, Operation *op);
82+
bool enclosing(scf::ForOp forOp, Operation *op);
83+
84+
// Return number of AccumCnts for the given ctrlOp. Add a single
85+
// AccumCnt for all channels under opsWithBufferReuse and it will be the
86+
// last AccumCnt.
87+
unsigned getAccumCnts(Operation *ctrlOp,
88+
const DenseSet<Operation *> &regionsWithChannels);
89+
90+
unsigned getAccumArgIdx(scf::ForOp parentForOp, Operation *ctrlOp,
91+
const DenseSet<Operation *> &regionsWithChannels);
92+
93+
SmallVector<Operation *>
94+
getTaskTopRegion(triton::FuncOp funcOp, const SmallVector<Channel *> &channels);
95+
96+
void appendAccumCntsForOps(SmallVector<Operation *> &taskTopOps,
97+
const SmallVector<Channel *> &channels,
98+
DenseSet<Operation *> &regionsWithChannels);
99+
100+
void collectRegionsWithChannels(const SmallVector<Channel *> &channels,
101+
DenseSet<Operation *> &regionsWithChannels);
102+
void insertAsyncCopy(
103+
triton::FuncOp funcOp,
104+
const DenseMap<Channel *, SmallVector<Channel *>>
105+
&channelsGroupedByProducers,
106+
const DenseMap<Channel *, Value> &bufferMap,
107+
DenseMap<Channel *, std::pair<Operation *, Operation *>> &copyOpMap,
108+
DenseSet<Operation *> &regionsWithChannels);
109+
110+
Value getAccumCount(OpBuilderWithAsyncTaskIds &builder, Operation *op,
111+
const DenseSet<Operation *> &regionsWithChannels);
112+
std::pair<Value, Value> getBufferIdxAndPhase(OpBuilderWithAsyncTaskIds &builder,
113+
Location loc, Value accumCnt,
114+
unsigned numBuffers);
115+
void getBufferIdxAndPhase(OpBuilderWithAsyncTaskIds &builder, Operation *op,
116+
unsigned numBuffers,
117+
const DenseSet<Operation *> &regionsWithChannels,
118+
Value &bufferIdx, Value &phase);
119+
120+
Value getBarrierForPipelineStage(OpBuilderWithAsyncTaskIds &builder,
121+
Value barrierAlloc, Value bufferIdx);
122+
123+
Operation *optimizeTMALoads(OpBuilderWithAsyncTaskIds &builder,
124+
SmallVector<tt::DescriptorLoadOp> &tmaLoads,
125+
SmallVector<Value> &buffers, Value barrierAlloc,
126+
Value bufferIdx, Value bufferIdxExtract,
127+
Value phase, Operation *headProducer,
128+
Operation *headConsumer);
129+
void specializeRegion(triton::FuncOp funcOp);
130+
131+
} // namespace mlir
132+
133+
#endif // NV_DIALECT_HOPPER_TRANSFORMS_CODEPARTITIONUTILITY_H_

third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/Utility.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
#ifndef NV_DIALECT_HOPPER_TRANSFORMS_UTILITY_H_
32

43
#include "mlir/IR/Builders.h"

0 commit comments

Comments
 (0)