Skip to content

Commit 4b9a8d6

Browse files
committed
Merge commit '8dd7ccb26c37ed2521a38a60e94dbff685c662cd'
2 parents 82b1f85 + 8dd7ccb commit 4b9a8d6

File tree

28 files changed

+1040
-176
lines changed

28 files changed

+1040
-176
lines changed

docs/programming-guide/chapter-3/debugging.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,6 @@ Using Third-party Tools
7777
For debugging on NVIDIA GPUs, `compute-sanitizer <https://docs.nvidia.com/cuda/compute-sanitizer/index.html>`_ is an effective tool for checking data races and memory access issues.
7878
To use it, prepend :code:`compute-sanitizer` to your command to run the Triton program.
7979

80-
For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
80+
For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
8181

8282
For detailed visualization of memory access in Triton programs, consider using the `triton-viz <https://github.com/Deep-Learning-Profiling-Tools/triton-viz>`_ tool, which is agnostic to the underlying GPUs.

include/triton/Analysis/Allocation.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy);
6363
ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
6464
RankedTensorType dstTy);
6565

66+
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
67+
RankedTensorType dstTy);
68+
69+
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
70+
RankedTensorType dstTy);
6671
} // namespace triton
6772

6873
/// Modified from llvm-15.0: llvm/ADT/AddressRanges.h

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,15 @@ bool isOuterLoop(scf::ForOp forOp);
6868
/// Function to mask operations during scheduling.
6969
Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
7070

71+
/// Wrap the operation into a MaskOp using the provided predicate, enabling high
72+
/// level predication abstraction during pipelining.
73+
Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op, Value pred);
74+
75+
// Utilize high level predication abstraction to perform optimizations before
76+
// lowering to predicated operations.
77+
void resolveMaskOp(ModuleOp moduleOp,
78+
DenseSet<triton::gpu::MaskOp> &peeledMaskOps);
79+
7180
// Return true if the given ForOp has the attribute
7281
// `tt.disallow_acc_multi_buffer` set to true.
7382
bool getDisallowAccMultiBuffer(scf::ForOp forOp);

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@
3434
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
3535
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h.inc"
3636

37+
namespace mlir::triton::nvidia_gpu::impl {
38+
LogicalResult verifyMMAv5Op(Operation *op);
39+
} // namespace mlir::triton::nvidia_gpu::impl
40+
3741
#define GET_ATTRDEF_CLASSES
3842
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.h.inc"
3943

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,9 @@ def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
5454
"setIsAsync",
5555
(ins "bool":$isAsync)>,
5656
];
57+
58+
let verify = [{
59+
return ::mlir::triton::nvidia_gpu::impl::verifyMMAv5Op($_op);
60+
}];
5761
}
5862
#endif // TRITON_NVIDIAGPU_OP_INTERFACES

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ static unsigned getBitwidth(RankedTensorType ty) {
3939
return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
4040
}
4141

42-
static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
43-
RankedTensorType dstTy) {
42+
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
43+
RankedTensorType dstTy) {
4444
auto *ctx = srcTy.getContext();
4545
auto srcLayout = gpu::toLinearLayout(srcTy);
4646
auto dstLayout = gpu::toLinearLayout(dstTy);
@@ -52,8 +52,8 @@ static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
5252
return smem.getTotalOutDimSize() / reps;
5353
}
5454

55-
static unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
56-
RankedTensorType dstTy) {
55+
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
56+
RankedTensorType dstTy) {
5757
auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
5858
return getNumScratchElements(scratchConfig.paddedRepShape);
5959
}

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,13 @@ class SinkTMEMLoad : public OpRewritePattern<ttng::TMEMLoadOp> {
141141
return postDomInfo.properlyPostDominates(use->getOwner(), domOp);
142142
}))
143143
return failure();
144-
if (domOp == load->getNextNode()) {
144+
// In order to not re-order multiple tmem loads in a loop, don't sink if
145+
// all the ops between the load and the domOp are tmem loads.
146+
Operation *nextNode = load->getNextNode();
147+
while (auto tmemLoad = dyn_cast<ttng::TMEMLoadOp>(nextNode)) {
148+
nextNode = tmemLoad->getNextNode();
149+
}
150+
if (domOp == nextNode) {
145151
// The load wasn't moved.
146152
return failure();
147153
}

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
#include "mlir/Analysis/TopologicalSortUtils.h"
33
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
44
#include "mlir/Dialect/Tensor/IR/Tensor.h"
5+
#include "mlir/Dialect/UB/IR/UBOps.h"
56
#include "mlir/IR/ImplicitLocOpBuilder.h"
67
#include "mlir/IR/TypeUtilities.h"
78
#include "mlir/Interfaces/SideEffectInterfaces.h"
89
#include "mlir/Support/LLVM.h"
10+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
911
#include "triton/Analysis/AxisInfo.h"
1012
#include "triton/Dialect/Triton/IR/Utility.h"
1113
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -279,6 +281,69 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
279281
return op;
280282
}
281283

284+
Operation *mlir::triton::wrapInMaskOp(RewriterBase &rewriter, Operation *op,
285+
Value pred) {
286+
auto mask =
287+
rewriter.create<ttg::MaskOp>(op->getLoc(), op->getResultTypes(), pred);
288+
rewriter.createBlock(&mask->getRegion(0));
289+
rewriter.setInsertionPointToStart(&mask->getRegion(0).front());
290+
auto newOp = rewriter.clone(*op);
291+
rewriter.create<ttg::MaskReturnOp>(op->getLoc(), newOp->getResults());
292+
op->replaceAllUsesWith(mask->getResults());
293+
rewriter.eraseOp(op);
294+
return mask;
295+
}
296+
297+
void mlir::triton::resolveMaskOp(ModuleOp moduleOp,
298+
DenseSet<ttg::MaskOp> &peeledMaskOps) {
299+
IRRewriter rewriter(moduleOp);
300+
301+
// Canonicalize the IR to simplify the arithmetic ops defining the mask
302+
auto arithDialect =
303+
moduleOp.getContext()->getLoadedDialect<arith::ArithDialect>();
304+
RewritePatternSet patterns(moduleOp.getContext());
305+
arithDialect->getCanonicalizationPatterns(patterns);
306+
if (mlir::applyPatternsGreedily(moduleOp, std::move(patterns)).failed())
307+
return llvm::report_fatal_error("Failed to canonicalize the IR");
308+
309+
// Prune all the statically dead mask ops in the epilogue. This is a
310+
// hack, ideally we should do it for all the mask ops, but it is incorrect if
311+
// we have speculatively executed async cp operations that will store to shmem
312+
// even if the mask is false.
313+
for (auto maskOp : peeledMaskOps) {
314+
rewriter.setInsertionPoint(maskOp);
315+
while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
316+
Operation *op = &maskOp.getBody()->front();
317+
if (isConstantIntValue(maskOp.getPred(), 0)) {
318+
if (op->getNumResults() > 0) {
319+
SmallVector<Value> results;
320+
for (auto result : op->getResults()) {
321+
auto poisonOp = rewriter.create<mlir::ub::PoisonOp>(
322+
op->getLoc(), result.getType());
323+
results.push_back(poisonOp);
324+
}
325+
op->replaceAllUsesWith(results);
326+
}
327+
op->erase();
328+
}
329+
}
330+
}
331+
332+
SmallVector<ttg::MaskOp> maskOps;
333+
moduleOp->walk([&](ttg::MaskOp maskOp) { maskOps.push_back(maskOp); });
334+
for (auto maskOp : maskOps) {
335+
rewriter.setInsertionPoint(maskOp);
336+
while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
337+
Operation *op = &maskOp.getBody()->front();
338+
rewriter.moveOpBefore(op, maskOp);
339+
op = triton::predicateOp(rewriter, op, maskOp.getPred());
340+
}
341+
maskOp->replaceAllUsesWith(
342+
maskOp.getBody()->getTerminator()->getOperands());
343+
maskOp->erase();
344+
}
345+
}
346+
282347
// Return true if the given ForOp has the attribute
283348
// `tt.disallow_acc_multi_buffer` set to true.
284349
bool mlir::triton::getDisallowAccMultiBuffer(scf::ForOp forOp) {

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 0 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#include "mlir/Dialect/Tensor/IR/Tensor.h"
2-
#include "mlir/Dialect/UB/IR/UBOps.h"
32
#include "mlir/IR/TypeUtilities.h"
43
#include "mlir/Interfaces/SideEffectInterfaces.h"
54
#include "mlir/Support/LLVM.h"
@@ -43,67 +42,6 @@ static void pipelineWgmma(ModuleOp moduleOp, unsigned numStages) {
4342
}
4443
}
4544

46-
static Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op,
47-
Value pred) {
48-
auto mask = rewriter.create<MaskOp>(op->getLoc(), op->getResultTypes(), pred);
49-
rewriter.createBlock(&mask->getRegion(0));
50-
rewriter.setInsertionPointToStart(&mask->getRegion(0).front());
51-
auto newOp = rewriter.clone(*op);
52-
rewriter.create<MaskReturnOp>(op->getLoc(), newOp->getResults());
53-
op->replaceAllUsesWith(mask->getResults());
54-
rewriter.eraseOp(op);
55-
return mask;
56-
}
57-
58-
static void resolveMaskOp(ModuleOp moduleOp, DenseSet<MaskOp> &peeledMaskOps) {
59-
IRRewriter rewriter(moduleOp);
60-
61-
// Canonicalize the IR to simplify the arithmetic ops defining the mask
62-
auto arithDialect =
63-
moduleOp.getContext()->getLoadedDialect<arith::ArithDialect>();
64-
RewritePatternSet patterns(moduleOp.getContext());
65-
arithDialect->getCanonicalizationPatterns(patterns);
66-
if (applyPatternsGreedily(moduleOp, std::move(patterns)).failed())
67-
return llvm::report_fatal_error("Failed to canonicalize the IR");
68-
69-
// Prune all the statically dead mask ops in the epilogue. This is a
70-
// hack, ideally we should do it for all the mask ops, but it is incorrect if
71-
// we have speculatively executed async cp operations that will store to shmem
72-
// even if the mask is false.
73-
for (auto maskOp : peeledMaskOps) {
74-
rewriter.setInsertionPoint(maskOp);
75-
while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
76-
Operation *op = &maskOp.getBody()->front();
77-
if (isConstantIntValue(maskOp.getPred(), 0)) {
78-
if (op->getNumResults() > 0) {
79-
SmallVector<Value> results;
80-
for (auto result : op->getResults()) {
81-
auto poisonOp =
82-
rewriter.create<ub::PoisonOp>(op->getLoc(), result.getType());
83-
results.push_back(poisonOp);
84-
}
85-
op->replaceAllUsesWith(results);
86-
}
87-
op->erase();
88-
}
89-
}
90-
}
91-
92-
SmallVector<MaskOp> maskOps;
93-
moduleOp->walk([&](MaskOp maskOp) { maskOps.push_back(maskOp); });
94-
for (auto maskOp : maskOps) {
95-
rewriter.setInsertionPoint(maskOp);
96-
while (&maskOp.getBody()->front() != maskOp.getBody()->getTerminator()) {
97-
Operation *op = &maskOp.getBody()->front();
98-
rewriter.moveOpBefore(op, maskOp);
99-
op = triton::predicateOp(rewriter, op, maskOp.getPred());
100-
}
101-
maskOp->replaceAllUsesWith(
102-
maskOp.getBody()->getTerminator()->getOperands());
103-
maskOp->erase();
104-
}
105-
}
106-
10745
static bool hasMMAv5WaitsInLastStage(scf::ForOp forOp,
10846
CoarseSchedule &schedule) {
10947
int maxStage = schedule.getNumStages() - 1;

lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,11 @@ Attribute getTmemLoadStoreLayout32x32b(unsigned M, unsigned N,
116116
unsigned numWarpGroups = numWarps / 4;
117117
if (numBlocks == 1) {
118118
// Split along the N dimension
119-
sizePerThread = {1, N / (numWarpGroups * 2)};
119+
sizePerThread = {1, ceil<unsigned>(N, numWarpGroups * 2)};
120120
threadsPerWarp = {16, 2};
121121
warpsPerCTA = {4, numWarpGroups};
122122
} else {
123-
sizePerThread = {1, N / 2};
123+
sizePerThread = {1, ceil<unsigned>(N, 2)};
124124
threadsPerWarp = {16, 2};
125125
warpsPerCTA = {0, 0};
126126
// Distribute at most as many warp groups as there is blocks
@@ -138,7 +138,7 @@ Attribute getTmemLoadStoreLayout32x32b(unsigned M, unsigned N,
138138
warpsPerCTA = {4 * numWarpGroups, 1};
139139
} else {
140140
// Split along N dimension
141-
sizePerThread = {1, N / numWarpGroups};
141+
sizePerThread = {1, ceil<unsigned>(N, numWarpGroups)};
142142
threadsPerWarp = {32, 1};
143143
warpsPerCTA = {4, numWarpGroups};
144144
}
@@ -223,6 +223,22 @@ bool isDistributedLayoutTMemCompatible(Operation *op,
223223
return areLayoutsEquivalent(tensorType.getShape(), layout, enc);
224224
}
225225

226+
LogicalResult impl::verifyMMAv5Op(Operation *op) {
227+
auto isInterleaved = [](MemDescType memdesc) {
228+
auto enc = dyn_cast<TensorMemoryEncodingAttr>(memdesc.getEncoding());
229+
return enc && getTmemAllocSizes(memdesc).numRows != 64 &&
230+
enc.getBlockM() == 64;
231+
};
232+
233+
auto itf = cast<MMAv5OpInterface>(op);
234+
if (isInterleaved(itf.getA().getType()) &&
235+
isInterleaved(itf.getAccumulator().getType())) {
236+
return op->emitOpError(
237+
"does not support blockM=64 with interleaved blocks in TMEM layout");
238+
}
239+
return success();
240+
}
241+
226242
} // namespace nvidia_gpu
227243
} // namespace triton
228244
} // namespace mlir

0 commit comments

Comments
 (0)