
Commit 67e72f8

Mogball authored and zwu-2025 committed

[Pipeliner] Refactor partition and stage builders (NFC) (triton-lang#6906)

1 parent f3fb638 · commit 67e72f8

14 files changed: +261 -330 lines changed

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 0 additions & 5 deletions

```diff
@@ -35,9 +35,6 @@ class Partition {
   int getStage() const { return stage; }
   ArrayRef<Operation *> getOps() const { return ops; }
 
-  void insert(Operation *op) { ops.push_back(op); }
-  void remove(Operation *op) { ops.erase(llvm::find(ops, op)); }
-
 private:
   void setIndex(int idx) { this->idx = idx; }
   friend class WarpSchedule;
@@ -59,8 +56,6 @@ class WarpSchedule {
 public:
   // Create a new partition with a stage.
   Partition *addPartition(unsigned stage);
-  // Update the op to partition mapping.
-  void updatePartitions();
 
   // Get the partition the op belongs to.
   Partition *getPartition(Operation *op);
```
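With `insert` and `remove` gone from the public surface, `Partition` is read-only outside of `WarpSchedule`. A minimal usage sketch against only what this hunk leaves visible; the surrounding pass code is assumed, not shown in the diff:

```cpp
// Sketch only: addPartition/getStage/getOps/getPartition are the accessors
// visible in this header; how ops get attributed to a partition is now an
// internal detail of WarpSchedule.
WarpSchedule schedule;
Partition *load = schedule.addPartition(/*stage=*/0);
Partition *mma = schedule.addPartition(/*stage=*/2);
assert(load->getStage() == 0 && mma->getStage() == 2);

// Consumers can still walk a partition's ops and map ops back to partitions.
for (Operation *op : load->getOps())
  assert(schedule.getPartition(op) == load);
```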

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 34 deletions

```diff
@@ -131,42 +131,12 @@ int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
 
 // Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
 // single buffer slice (leading dimension equal to 1), at the given index.
-template <typename TBuilder>
 TypedValue<triton::gpu::MemDescType>
-createSingleBufferView(TBuilder &builder, Value alloc, Value idx) {
-  assert(isa<triton::gpu::MemDescType>(alloc.getType()) &&
-         "Expected MemDescType");
-  auto allocDescType = cast<triton::gpu::MemDescType>(alloc.getType());
-  SmallVector<int64_t> shape;
-  if (allocDescType.getShape().size() > 1) {
-    shape.insert(shape.end(), allocDescType.getShape().begin() + 1,
-                 allocDescType.getShape().end());
-  } else {
-    shape.push_back(1);
-  }
-  auto viewDescType = triton::gpu::MemDescType::get(
-      shape, allocDescType.getElementType(), allocDescType.getEncoding(),
-      allocDescType.getMemorySpace(), allocDescType.getMutableMemory(),
-      /*allocShape=*/allocDescType.getAllocShape());
-  SmallVector<Value> idxs = {idx};
-  if (allocDescType.getShape().size() > 1) {
-    Value zero =
-        builder.template create<arith::ConstantIntOp>(alloc.getLoc(), 0, 32);
-    for (unsigned i = 1; i < allocDescType.getShape().size(); i++) {
-      idxs.push_back(zero);
-    }
-  }
-  return builder.template create<triton::gpu::MemDescSubviewOp>(
-      alloc.getLoc(), viewDescType, alloc, idxs);
-}
-
-template <typename TBuilder>
+createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
+
+// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// single buffer slice (leading dimension equal to 1), at the given index.
 TypedValue<triton::gpu::MemDescType>
-createSingleBufferView(TBuilder &builder, Value alloc, int idx) {
-  return createSingleBufferView(
-      builder, alloc,
-      builder.template create<arith::ConstantIntOp>(alloc.getLoc(), idx, 32));
-}
+createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
 
 } // namespace triton
 } // namespace mlir
```
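The definitions presumably move to a .cpp essentially unchanged; with a concrete `OpBuilder`, the `builder.template create<...>` disambiguation collapses to `builder.create<...>`. A sketch of the `int` overload under that assumption, derived from the deleted inline body (the actual .cpp is not part of this hunk):

```cpp
// Assumed shape of the out-of-line definition (not shown in this diff).
TypedValue<triton::gpu::MemDescType>
mlir::triton::createSingleBufferView(OpBuilder &builder, Value alloc,
                                     int idx) {
  // Wrap the constant index and defer to the Value overload above.
  return createSingleBufferView(
      builder, alloc,
      builder.create<arith::ConstantIntOp>(alloc.getLoc(), idx, 32));
}
```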

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 6 additions & 31 deletions

```diff
@@ -250,44 +250,19 @@ void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
 } // namespace mlir
 
 namespace mlir::triton {
-
 /// Replace all uses of `oldUse` with `val` and propagate the type if needed.
 /// This is useful when we need to change a memory descriptor from immutable to
 /// mutable.
 void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
                                  Value val);
 
-template <typename BuilderT>
+/// Replace all uses of `old` with a local load from `alloc` unless the use is a
+/// `ttg.local_alloc` with a matching shared encoding, in which case the shared
+/// memory is forwarded directly into the use.
 void replaceUsesWithLocalLoad(
-    BuilderT &builder, OpResult old, TypedValue<triton::gpu::MemDescType> alloc,
-    TypedValue<triton::gpu::AsyncTokenType> token = {}) {
-  // Remove redundant local_load -> local_alloc
-  namespace ttg = triton::gpu;
-  using triton::gpu::LocalAllocOp;
-  auto allocTy = alloc.getType();
-  SmallVector<LocalAllocOp> allocsToErase;
-  for (Operation *user : old.getUsers()) {
-    if (auto userAlloc = dyn_cast<LocalAllocOp>(user)) {
-      if (allocTy.getEncoding() == userAlloc.getType().getEncoding()) {
-        replaceUsesAndPropagateType(builder, userAlloc, alloc);
-        allocsToErase.push_back(userAlloc);
-      }
-    }
-  }
-
-  // If there are some uses that were not local_allocs, we need to create a
-  // local_load for them.
-  if (std::distance(old.getUsers().begin(), old.getUsers().end()) >
-      allocsToErase.size()) {
-    auto loc = old.getOwner()->getLoc();
-    auto sharedLoad = builder.template create<ttg::LocalLoadOp>(
-        loc, old.getType(), alloc, token);
-    old.replaceAllUsesWith(sharedLoad.getResult());
-  }
-  for (auto alloc : allocsToErase) {
-    alloc.erase();
-  }
-}
+    OpBuilder &builder, OpResult old,
+    TypedValue<triton::gpu::MemDescType> alloc,
+    TypedValue<triton::gpu::AsyncTokenType> token = {});
 } // namespace mlir::triton
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
```
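Same pattern here: the deleted body can move into the .cpp nearly verbatim, with only the dependent-name syntax dropped. The one call that changes shape, assuming a straight move:

```cpp
// Header template (before): `builder` has a dependent type, so the nested
// template-id must be disambiguated with the `template` keyword.
auto sharedLoad = builder.template create<ttg::LocalLoadOp>(
    loc, old.getType(), alloc, token);

// .cpp definition (after, assumed): concrete OpBuilder, plain call syntax.
auto sharedLoad =
    builder.create<ttg::LocalLoadOp>(loc, old.getType(), alloc, token);
```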

include/triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h

Lines changed: 0 additions & 3 deletions

```diff
@@ -8,9 +8,6 @@ namespace scf {
 class ForOp;
 } // namespace scf
 namespace triton::gpu {
-// Identify load-mma dependencies and specialize them to different partitions.
-LogicalResult specializeLoadMMADependencies(scf::ForOp &loop,
-                                            int defaultNumStages);
 // This is the final step to prepare a loop for warp specialization. This takes
 // a loop with a partition schedule and rewrites the loop such that all SSA
 // dependencies between partitions are passed through shared memory and
```

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 5 additions & 101 deletions

```diff
@@ -16,17 +16,9 @@ inline bool isFp4Padded(Attribute encoding) {
   return mmaEnc && mmaEnc.getFp4Padded();
 }
 
-template <typename BuilderT>
-inline SmallVector<Value> translateTMAIndices(BuilderT &builder, Location loc,
-                                              Attribute encoding,
-                                              SmallVector<Value> indices) {
-  if (isFp4Padded(encoding)) {
-    auto two = builder.template create<arith::ConstantIntOp>(loc, 2, 32);
-    indices.back() =
-        builder.template create<arith::MulIOp>(loc, indices.back(), two);
-  }
-  return indices;
-}
+SmallVector<Value> translateTMAIndices(OpBuilder &builder, Location loc,
+                                       Attribute encoding,
+                                       SmallVector<Value> indices);
 
 gpu::CTALayoutAttr updateCTALayoutForShape(gpu::CTALayoutAttr ctaLayout,
                                            ArrayRef<int64_t> shape);
```
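A presumed out-of-line definition for `translateTMAIndices`, reconstructed from the deleted inline body (the actual .cpp is not part of this hunk). Fp4-padded layouts pack two mxfp4 elements per byte, so the innermost TMA coordinate is doubled:

```cpp
SmallVector<Value> translateTMAIndices(OpBuilder &builder, Location loc,
                                       Attribute encoding,
                                       SmallVector<Value> indices) {
  if (isFp4Padded(encoding)) {
    // Two mxfp4 elements per byte: scale the contiguous-dim index by 2.
    auto two = builder.create<arith::ConstantIntOp>(loc, 2, 32);
    indices.back() = builder.create<arith::MulIOp>(loc, indices.back(), two);
  }
  return indices;
}
```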
```diff
@@ -69,95 +61,7 @@ std::optional<int> getTMASwizzleMode(Operation *op, TensorDescType ty);
 
 std::optional<int> getTMAElementType(Operation *op, TensorDescType ty);
 
-template <typename BuilderT>
-mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
-                                  mlir::triton::MakeTensorDescOp op,
-                                  BuilderT &builder) {
-  using namespace mlir;
-  MLIRContext *ctx = op.getContext();
-  auto loc = op.getLoc();
-  auto mkI32Constant = [&](int32_t val) {
-    return builder.template create<arith::ConstantOp>(
-        loc, builder.getI32Type(), builder.getI32IntegerAttr(val));
-  };
-
-  auto elemType = op.getBase().getType().getPointeeType();
-  auto elemSize = elemType.getIntOrFloatBitWidth() / 8;
-  auto encoding = op.getType().getBlockType().getEncoding();
-  auto mmaEncoding =
-      llvm::dyn_cast_or_null<gpu::NVMMASharedEncodingAttr>(encoding);
-  bool fp4Padded = mmaEncoding && mmaEncoding.getFp4Padded();
-
-  int paddingScale = fp4Padded ? 2 : 1;
-  auto shapePerCTA = gpu::getShapePerCTA(encoding, op.getTensorShape());
-  auto blockShape =
-      getTMABlockShape(encoding, shapePerCTA, /*packedSize=*/false);
-  auto contigDimSize = blockShape.back();
-
-  llvm::SmallVector<Value> boxDim;
-  if (fp4Padded && contigDimSize != 128) {
-    return op->emitError(
-        "FP4 padded loads require 128 elements or more in the last dim");
-  }
-  boxDim.push_back(mkI32Constant(contigDimSize));
-  for (int k = shapePerCTA.size() - 2; k >= 0; --k)
-    boxDim.push_back(mkI32Constant(blockShape[k]));
-
-  unsigned swizzleBytes = mmaEncoding ? mmaEncoding.getSwizzlingByteWidth() : 0;
-  if (!mmaEncoding) {
-    auto swizzledEnc = dyn_cast<gpu::SwizzledSharedEncodingAttr>(
-        op.getType().getBlockType().getEncoding());
-    if (!swizzledEnc || swizzledEnc.getVec() != 1 ||
-        swizzledEnc.getPerPhase() != 1 || swizzledEnc.getMaxPhase() != 1) {
-      op->emitError() << "Unhandled encoding type";
-      return failure();
-    }
-  }
-
-  auto maybeSwizzleMode = getTMASwizzleMode(op, op.getType());
-  if (!maybeSwizzleMode)
-    return failure();
-  auto swizzleMode = *maybeSwizzleMode;
-
-  Value elemSizeVal = builder.template create<arith::ConstantOp>(
-      loc, builder.getI64Type(), builder.getI64IntegerAttr(elemSize));
-
-  SmallVector<Value> globalDim(llvm::reverse(op.getShape()));
-  SmallVector<Value> globalStride;
-  for (int k = op.getStrides().size() - 2; k >= 0; --k) {
-    globalStride.push_back(op.getStrides()[k]);
-  }
-
-  if (fp4Padded) {
-    // Convert number of bytes to number of mxfp4 elements
-    globalDim[0] = builder.template create<arith::MulIOp>(loc, globalDim[0],
-                                                          mkI32Constant(2));
-  }
-
-  SmallVector<Value> elementStride(globalDim.size(), mkI32Constant(1));
-
-  for (int i = 0; i < globalStride.size(); ++i)
-    globalStride[i] = builder.template create<arith::MulIOp>(
-        loc, globalStride[i], elemSizeVal);
-
-  auto elemTypeEnum = getTMAElementType(op, op.getType());
-  if (!elemTypeEnum) {
-    return failure();
-  }
-
-  builder.template create<triton::ExperimentalTensormapCreateOp>(
-      loc,
-      /*desc_ptr=*/tmaPtr,
-      /*global_address=*/op.getBase(),
-      /*box_dim=*/boxDim,
-      /*global_dim=*/globalDim,
-      /*global_stride=*/globalStride,
-      /*element_strides=*/elementStride,
-      /*elem_type*/ builder.getI32IntegerAttr(*elemTypeEnum),
-      /*interleave_layout*/ builder.getI32IntegerAttr(0),
-      /*swizzle_mode=*/builder.getI32IntegerAttr(swizzleMode),
-      /*fill_mode=*/builder.getI32IntegerAttr(0));
-  return success();
-}
+LogicalResult createTMADesc(Value tmaPtr, MakeTensorDescOp op,
+                            OpBuilder &builder);
 
 } // namespace mlir::triton::nvidia_gpu
```
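Taken together, the hunks in this commit follow one mechanical recipe: each header-only `template <typename BuilderT>` helper becomes a declaration against the concrete `OpBuilder`, with the body moved into a single .cpp. Besides shrinking the headers, this removes the `builder.template create<...>` disambiguation that dependent types force. A self-contained toy illustration of that C++ rule (toy types, not Triton code):

```cpp
#include <iostream>

struct Builder {
  template <typename OpT> OpT create(int x) { return OpT{x}; }
};
struct ConstOp { int v; };

// In a template, `b` has a dependent type, so the nested template-id needs
// the `template` keyword to parse...
template <typename BuilderT> ConstOp makeGeneric(BuilderT &b) {
  return b.template create<ConstOp>(42);
}

// ...while a concrete parameter type needs no disambiguation and lets the
// definition live out-of-line in a single translation unit.
ConstOp makeConcrete(Builder &b) { return b.create<ConstOp>(42); }

int main() {
  Builder b;
  std::cout << makeGeneric(b).v + makeConcrete(b).v << "\n"; // prints 84
}
```

The trade-off is the usual one for de-templatizing: one out-of-line definition instead of a per-TU instantiation, at the cost of requiring every caller to go through `OpBuilder`, which all of these call sites already did.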
