
Commit 12f8ae7

[Blackwell] Move optimizeTMemLoad and tmem load subtiling into one pass (triton-lang#6715)
Follow-up to triton-lang#6694 that moves `optimizeTMemLoad` (which picks a split-M layout when the tmem_load result feeds a reduction) into the tmem load subtiling pass and renames that pass to `optimize-tmem-layouts`. This separates the optimization from the accelerate-matmul pass, so the relayout pass no longer needs to run it.
1 parent e32c3b1
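For context, a minimal Python sketch of the condition under which the moved optimization switches a tmem_load to the split-M layout. It paraphrases the checks in the C++ pattern shown in the diff below; the function name and signature are illustrative only, not part of this change.

```python
def wants_split_m_layout(num_warps: int, block_m: int, reduction_axis: int | None) -> bool:
    """Paraphrase of the checks in TMemLoadReducePattern (illustrative, not the real API)."""
    # With a single warpgroup the default layout is already reduction friendly,
    # so the pattern only fires for 8 warps (two warpgroups).
    if num_warps != 8:
        return False
    # The pattern is restricted to TMEM blocks with blockM == 128.
    if block_m != 128:
        return False
    # Only a reduction along N (axis == 1) reachable from the load triggers the
    # switch from the default split-N distribution to split-M.
    return reduction_axis == 1


assert wants_split_m_layout(8, 128, 1)       # reduce along N -> pick split-M
assert not wants_split_m_layout(4, 128, 1)   # single warpgroup -> keep default
```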

File tree

10 files changed, +103 -119 lines

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ std::unique_ptr<Pass> createTritonNvidiaGPURemoveTMEMTokensPass();
 
 std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeDescriptorEncodingPass();
 
-std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeTMemSubtilingPass();
+std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeTMemLayoutsPass();
 
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td

Lines changed: 4 additions & 3 deletions
@@ -130,11 +130,12 @@ def TritonNvidiaGPUOptimizeDescriptorEncodingPass : Pass<"triton-nvidia-optimize
                            "mlir::triton::TritonDialect"];
 }
 
-def TritonNvidiaGPUOptimizeTMemSubtilingPass : Pass<"triton-nvidia-optimize-tmem-subtiling", "mlir::ModuleOp"> {
-  let summary = "Optimize subtiling.";
+def TritonNvidiaGPUOptimizeTMemLayoutsPass : Pass<"triton-nvidia-optimize-tmem-layouts", "mlir::ModuleOp"> {
+  let summary = "Optimize TMEM layouts.";
 
   let description = [{
-    Optimize subtiling by trying to split tmem_load when user splits a tensor.
+    Optimize TMEM layouts by selecting a layouts to enable better subtiling,
+    reduction performance, etc.
   }];
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 0 additions & 60 deletions
@@ -807,63 +807,6 @@ static void decomposeMixedModeDotOp(ModuleOp mod, int computeCapability) {
   });
 }
 
-// When there are multiple warpgroups tmem_load results can be distirbuted along
-// M or N across the warpgroups. By default distribute along N but when there is
-// a reduction along N dimension we want to distribute along M instead to avoid
-// having to reduce across warps.
-static void optimizeTMemLoad(ModuleOp mod) {
-  SmallVector<triton::nvidia_gpu::TMEMLoadOp> tmemLoads;
-  mod.walk([&](triton::nvidia_gpu::TMEMLoadOp tmemLoadOp) -> void {
-    tmemLoads.push_back(tmemLoadOp);
-  });
-  for (triton::nvidia_gpu::TMEMLoadOp tmemLoadOp : tmemLoads) {
-    int numWarps = lookupNumWarps(tmemLoadOp);
-    // If there is only 1 warpgroup there is nothing to optimize as the layout
-    // is already reduction friendly.
-    if (numWarps != 8)
-      return;
-    auto tmemEnc = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
-        tmemLoadOp.getSrc().getType().getEncoding());
-    if (!tmemEnc)
-      continue;
-    int M = tmemEnc.getBlockM();
-    int N = tmemEnc.getBlockN();
-    if (M != 128)
-      continue;
-    bool foundReductionAlongN = false;
-    auto filter = [&](Operation *op) {
-      if (isa<ConvertLayoutOp>(op) || op->hasTrait<OpTrait::Elementwise>())
-        return true;
-      if (auto reduce = dyn_cast<triton::ReduceOp>(op)) {
-        foundReductionAlongN = reduce.getAxis() == 1;
-      }
-      return false;
-    };
-    ForwardSliceOptions fwdOpt;
-    fwdOpt.filter = filter;
-    SetVector<mlir::Operation *> fwdSlices;
-    getForwardSlice(tmemLoadOp.getResult(), &fwdSlices, fwdOpt);
-    if (!foundReductionAlongN)
-      continue;
-    // Try to split along M dimension but follow the restrictions of TMEM:
-    // warp0 get M = 0, warp 1 gets M = 32, warp 2 gets M = 64, warp 3 gets
-    // M = 96 warp 4 gets M = 16, warp 5 gets M = 48, warp 6 gets M = 80,
-    // warp 7 gets M = 112
-    RankedTensorType oldType = tmemLoadOp.getType();
-    Attribute newLayout = triton::gpu::LinearEncodingAttr::get(
-        tmemLoadOp.getContext(),
-        getTmemLoadLayoutSplitLongM(M, N, oldType, numWarps));
-    auto newType = RankedTensorType::get(oldType.getShape(),
-                                         oldType.getElementType(), newLayout);
-    tmemLoadOp.getResult().setType(newType);
-    OpBuilder builder(tmemLoadOp);
-    builder.setInsertionPointAfter(tmemLoadOp);
-    auto cvt = builder.create<ConvertLayoutOp>(tmemLoadOp.getLoc(), oldType,
-                                               tmemLoadOp.getResult());
-    tmemLoadOp.getResult().replaceAllUsesExcept(cvt.getResult(), cvt);
-  }
-}
-
 // Transpose scaled_dot ops that have a scale on lhs.
 static void transposeDotOp(DotScaledOp dotOp) {
   OpBuilder builder(dotOp);
@@ -931,9 +874,6 @@ class TritonGPUAccelerateMatmulPass
     // Now that we have picked the mma type, decompose dot that are not natively
     // supported.
    decomposeMixedModeDotOp(m, computeCapability);
-
-    // Pick an optimized tmem load layout based on its users.
-    optimizeTMemLoad(m);
   }
 };

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ add_triton_library(TritonNvidiaGPUTransforms
   FenceInsertion.cpp
   MMALowering.cpp
   OptimizeDescriptorEncoding.cpp
-  OptimizeTMemSubtiling.cpp
+  OptimizeTMemLayouts.cpp
   PlanCTA.cpp
   PromoteLHSToTMem.cpp
   RemoveTMEMTokens.cpp

lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeTMemSubtiling.cpp renamed to lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeTMemLayouts.cpp

Lines changed: 71 additions & 8 deletions
@@ -1,3 +1,4 @@
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -181,27 +182,89 @@ class TMemSplitLoadPattern : public OpRewritePattern<tt::SplitOp> {
   }
 };
 
-class TritonNvidiaGPUOptimizeTMemSubtilingPass
-    : public TritonNvidiaGPUOptimizeTMemSubtilingPassBase<
-          TritonNvidiaGPUOptimizeTMemSubtilingPass> {
+// Pick an optimized tmem load layout based on its users. When there are
+// multiple warpgroups tmem_load results can be distirbuted along M or N across
+// the warpgroups. By default distribute along N but when there is a reduction
+// along N dimension we want to distribute along M instead to avoid having to
+// reduce across warps.
+class TMemLoadReducePattern : public OpRewritePattern<ttng::TMEMLoadOp> {
 public:
-  using BaseT = TritonNvidiaGPUOptimizeTMemSubtilingPassBase<
-      TritonNvidiaGPUOptimizeTMemSubtilingPass>;
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ttng::TMEMLoadOp tmemLoadOp,
+                                PatternRewriter &rewriter) const override {
+    int numWarps = ttg::lookupNumWarps(tmemLoadOp);
+    // If there is only 1 warpgroup there is nothing to optimize as the layout
+    // is already reduction friendly.
+    if (numWarps != 8)
+      return failure();
+    auto tmemEnc = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
+        tmemLoadOp.getSrc().getType().getEncoding());
+    if (!tmemEnc)
+      return failure();
+    int M = tmemEnc.getBlockM();
+    int N = tmemEnc.getBlockN();
+    if (M != 128)
+      return failure();
+    bool foundReductionAlongN = false;
+    auto filter = [&](Operation *op) {
+      if (isa<ttg::ConvertLayoutOp>(op) || op->hasTrait<OpTrait::Elementwise>())
+        return true;
+      if (auto reduce = dyn_cast<triton::ReduceOp>(op)) {
+        foundReductionAlongN = reduce.getAxis() == 1;
+      }
+      return false;
+    };
+    ForwardSliceOptions fwdOpt;
+    fwdOpt.filter = filter;
+    SetVector<mlir::Operation *> fwdSlices;
+    getForwardSlice(tmemLoadOp.getResult(), &fwdSlices, fwdOpt);
+    if (!foundReductionAlongN)
+      return failure();
+    // Try to split along M dimension but follow the restrictions of TMEM:
+    // warp0 get M = 0, warp 1 gets M = 32, warp 2 gets M = 64, warp 3 gets
+    // M = 96 warp 4 gets M = 16, warp 5 gets M = 48, warp 6 gets M = 80,
+    // warp 7 gets M = 112
+    RankedTensorType oldType = tmemLoadOp.getType();
+    Attribute newLayout = ttg::LinearEncodingAttr::get(
+        tmemLoadOp.getContext(),
+        ttg::getTmemLoadLayoutSplitLongM(M, N, oldType, numWarps));
+    if (newLayout == oldType.getEncoding())
+      return failure();
+
+    auto newType = RankedTensorType::get(oldType.getShape(),
+                                         oldType.getElementType(), newLayout);
+    tmemLoadOp.getResult().setType(newType);
+    OpBuilder builder(tmemLoadOp);
+    builder.setInsertionPointAfter(tmemLoadOp);
+    auto cvt = builder.create<ttg::ConvertLayoutOp>(
+        tmemLoadOp.getLoc(), oldType, tmemLoadOp.getResult());
+    tmemLoadOp.getResult().replaceAllUsesExcept(cvt.getResult(), cvt);
+    return success();
+  }
+};
+
+class TritonNvidiaGPUOptimizeTMemLayoutsPass
+    : public TritonNvidiaGPUOptimizeTMemLayoutsPassBase<
+          TritonNvidiaGPUOptimizeTMemLayoutsPass> {
+public:
+  using BaseT = TritonNvidiaGPUOptimizeTMemLayoutsPassBase<
+      TritonNvidiaGPUOptimizeTMemLayoutsPass>;
   using BaseT::BaseT;
 
   void runOnOperation() override {
     MLIRContext *context = &getContext();
     ModuleOp m = getOperation();
 
     mlir::RewritePatternSet patterns(context);
-    patterns.add<TMemSplitLoadPattern>(context);
+    patterns.add<TMemSplitLoadPattern, TMemLoadReducePattern>(context);
     if (failed(applyPatternsGreedily(m, std::move(patterns))))
      signalPassFailure();
   }
 };
 
 } // namespace
 
-std::unique_ptr<Pass> mlir::createTritonNvidiaGPUOptimizeTMemSubtilingPass() {
-  return std::make_unique<TritonNvidiaGPUOptimizeTMemSubtilingPass>();
+std::unique_ptr<Pass> mlir::createTritonNvidiaGPUOptimizeTMemLayoutsPass() {
+  return std::make_unique<TritonNvidiaGPUOptimizeTMemLayoutsPass>();
 }
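The TMEM restriction comment in the pattern above lists which M offset each warp owns in the split-M layout. As a sanity check, here is a small Python sketch that reproduces those offsets; the closed-form expression is inferred from the listed values and is not the actual getTmemLoadLayoutSplitLongM implementation.

```python
def split_m_warp_offsets(num_warps: int = 8) -> list[int]:
    """Illustrative reconstruction of the warp -> M offset mapping described in
    the comment above: warps 0-3 take rows 0/32/64/96, warps 4-7 take 16/48/80/112."""
    return [(w % 4) * 32 + (w // 4) * 16 for w in range(num_warps)]


# Matches the values spelled out in the TMEM restriction comment.
assert split_m_warp_offsets() == [0, 32, 64, 96, 16, 48, 80, 112]
```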

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 0 additions & 19 deletions
@@ -511,25 +511,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
-  // CHECK{LITERALE}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 32]], warp = [[32, 0], [64, 0], [16, 0]], block = []}>
-  // CHECK-LABEL: dot_reduce
-  tt.func public @dot_reduce(%arg0: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>, %arg1: tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-    %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked>
-    %0 = tt.dot %arg0, %arg1, %cst, inputPrecision = tf32 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x64xf32, #blocked>
-    // ttng.tmem_load %{{.*}} : !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x64xf32, #linear>
-    %1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({
-    ^bb0(%arg2: f32, %arg3: f32):
-      %2 = arith.addf %arg2, %arg3 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<128x64xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-    tt.return %1 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
 // CHECK-DAG: #[[$SHARED_A:.+]] = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
 // CHECK-DAG: #[[$SHARED_B:.+]] = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8, fp4Padded = true}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
test/TritonGPU/optimize-partition-warps.mlir

Lines changed: 0 additions & 23 deletions
@@ -163,27 +163,4 @@ tt.func @tmem_min_4_warps(%tensor_desc: !ttg.memdesc<64x64xf32, #tmem, #ttng.ten
   tt.return
 }
 
-tt.func @tmem_split_m_layout(%tensor_desc: !ttg.memdesc<128x64xf32, #tmem1, #ttng.tensor_memory, mutable>) {
-  ttg.warp_specialize(%tensor_desc)
-  default {
-    ttg.warp_yield
-  }
-  // CHECK: partition0{{.*}} num_warps(8)
-  partition0(%desc: !ttg.memdesc<128x64xf32, #tmem1, #ttng.tensor_memory, mutable>) num_warps(16) {
-    // CHECK: ttng.tmem_load {{.*}} -> tensor<128x64xf32, #linear>
-    %0 = ttng.tmem_load %desc : !ttg.memdesc<128x64xf32, #tmem1, #ttng.tensor_memory, mutable> -> tensor<128x64xf32, #blocked2d_16>
-
-    %1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({
-    ^bb0(%arg2: f32, %arg3: f32):
-      %2 = arith.addf %arg2, %arg3 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<128x64xf32, #blocked2d_16>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked2d_16}>>
-
-    %cst = arith.constant dense<0.0> : tensor<128x128xf32, #blocked2d_16>
-    "use"(%1, %cst) : (tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked2d_16}>>, tensor<128x128xf32, #blocked2d_16>) -> ()
-    ttg.warp_return
-  } : (!ttg.memdesc<128x64xf32, #tmem1, #ttng.tensor_memory, mutable>) -> ()
-  tt.return
-}
-
 }

test/TritonNvidiaGPU/tmem_subtiling.mlir renamed to test/TritonNvidiaGPU/tmem_layouts.mlir

Lines changed: 23 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --triton-nvidia-optimize-tmem-subtiling --allow-unregistered-dialect | FileCheck %s
+// RUN: triton-opt %s -split-input-file --triton-nvidia-optimize-tmem-layouts --allow-unregistered-dialect | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
@@ -106,3 +106,25 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return %5, %6, %11 : tensor<128x64xf16, #blocked>, tensor<128x64xf16, #blocked>, tensor<128x64xf16, #blocked>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 64, unpacked = true>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+
+// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 32]], warp = [[32, 0], [64, 0], [16, 0]], block = []}>
+// CHECK-LABEL: tmem_load_reduce
+tt.func public @tmem_load_reduce(%arg0: !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
+  %0 = ttng.tmem_load %arg0 : !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory> -> tensor<128x64xf32, #blocked>
+  // CHECK: ttng.tmem_load %{{.*}} : !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory> -> tensor<128x64xf32, #linear>
+  %1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({
+  ^bb0(%arg2: f32, %arg3: f32):
+    %2 = arith.addf %arg2, %arg3 : f32
+    tt.reduce.return %2 : f32
+  }) : (tensor<128x64xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
+  tt.return %1 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
+}
+
+}

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -270,7 +270,7 @@ def make_ttgir(mod, metadata, opt, capability):
        passes.ttgpuir.add_WGMMAPrefetch(pm)
        passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
        passes.ttgpuir.add_coalesce_async_copy(pm)
-       nvidia.passes.ttnvgpuir.add_optimize_tmem_subtiling(pm)
+       nvidia.passes.ttnvgpuir.add_optimize_tmem_layouts(pm)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        passes.ttgpuir.add_reduce_data_duplication(pm)
        passes.ttgpuir.add_reorder_instructions(pm)

third_party/nvidia/triton_nvidia.cc

Lines changed: 2 additions & 2 deletions
@@ -48,8 +48,8 @@ void init_triton_nvidia_passes_ttnvgpuir(py::module &&m) {
                      mlir::createTritonNvidiaGPUMMALoweringPass);
   ADD_PASS_WRAPPER_0("add_optimize_descriptor_encoding",
                      mlir::createTritonNvidiaGPUOptimizeDescriptorEncodingPass);
-  ADD_PASS_WRAPPER_0("add_optimize_tmem_subtiling",
-                     mlir::createTritonNvidiaGPUOptimizeTMemSubtilingPass);
+  ADD_PASS_WRAPPER_0("add_optimize_tmem_layouts",
+                     mlir::createTritonNvidiaGPUOptimizeTMemLayoutsPass);
 }
 
 void init_triton_nvidia_passes_nvws(py::module &&m) {

0 commit comments
