[Blackwell] Hoist constant TMem allocation out of the loop (#5857)

ThomasRaoux · web-flow · commit d8278513b831 · 2025-02-07T15:45:28.000-08:00
For cases where the TMem contents are constant, hoisting the allocation out
of the loop avoids having to store multiple times.
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/KeepAccInTMem.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/KeepAccInTMem.cpp
@@ -79,6 +79,138 @@ class TMEMToGlobal : public OpRewritePattern<triton::StoreOp> {
   }
 };
 
+static void addTMEMLoad(IRRewriter &rewriter, ttng::TMEMAllocOp localAlloc,
+                        Operation *user, int argNo) {
+  rewriter.setInsertionPoint(user); // Create the load right before `user`.
+  auto load = rewriter.create<ttng::TMEMLoadOp>(
+      user->getLoc(), user->getOperand(argNo).getType(),
+      localAlloc->getResult(0));
+  user->setOperand(argNo, load); // Operand `argNo` now reads the new load.
+}
+
+// Returns true if the accumulator of `mmaOp` can be kept in TMEM across the
+// iterations of `forOp`; the out-parameters then describe the match. Expected:
+//   %acc_tm = ttng.tmem_alloc %acc
+//   ttng.tc_gen5_mma %A_sh, %B_sh, %acc_tm
+//   %acc_res = ttng.tmem_load %acc_tm
+// `accUsers` collects the remaining (op, operand index) uses of the value.
+static bool canKeepAccInTmem(scf::ForOp forOp, Operation *mmaOp,
+                             ttng::TMEMAllocOp &localAlloc,
+                             ttng::TMEMLoadOp &localLoad,
+                             SmallVector<std::pair<Operation *, int>> &accUsers,
+                             unsigned &yieldArgNo) {
+  localLoad = nullptr;
+  localAlloc = mmaOp->getOperand(2).getDefiningOp<ttng::TMEMAllocOp>();
+  if (!localAlloc)
+    return false;
+  for (auto user : localAlloc->getUsers()) {
+    if (isa<ttng::TMEMLoadOp>(user)) {
+      localLoad = cast<ttng::TMEMLoadOp>(user);
+    } else if (user != mmaOp) {
+      // The accumulator is used by another operation; not what we expect.
+      localLoad = nullptr;
+      return false;
+    }
+  }
+  // No tmem_load user found; `localLoad` must not be dereferenced below.
+  if (!localLoad)
+    return false;
+
+  SmallVector<Value> queue;
+  queue.push_back(localLoad->getResult(0));
+  bool foundDotCycle = false;
+  while (!queue.empty()) {
+    Value value = queue.pop_back_val();
+    for (auto &use : value.getUses()) {
+      if (use.getOwner() == localAlloc) {
+        foundDotCycle = true;
+        continue;
+      }
+      if (auto yieldOp = dyn_cast<scf::YieldOp>(use.getOwner())) {
+        if (yieldOp->getParentOp() == forOp) {
+          yieldArgNo = use.getOperandNumber();
+          queue.push_back(forOp.getRegionIterArg(yieldArgNo));
+          continue;
+        }
+        // TODO: A yield inside an scf.if means the accumulator is modified in
+        // the other branch of the ifOp, which we cannot handle yet. A yield
+        // with any other parent is not something we expect either. Back out.
+        return false;
+      }
+      accUsers.emplace_back(use.getOwner(), use.getOperandNumber());
+    }
+  }
+  return foundDotCycle;
+}
+
+// Keep the accumulator of `mmaOp` in TMEM across the iterations of `forOp`.
+// Only applies when the same MMA reuses the accumulator in later iterations.
+static void hoistReadModifyWrite(Operation *mmaOp, scf::ForOp forOp) {
+  SmallVector<std::pair<Operation *, int>> accUsers;
+  ttng::TMEMAllocOp localAlloc = nullptr;
+  ttng::TMEMLoadOp localLoad = nullptr;
+  unsigned yieldArgNo;
+  if (!canKeepAccInTmem(forOp, mmaOp, localAlloc, localLoad, accUsers,
+                        yieldArgNo)) {
+    return;
+  }
+
+  assert(localLoad != nullptr);
+  assert(localAlloc != nullptr);
+  Type loadType = localLoad->getResult(0).getType();
+  IRRewriter rewriter(forOp);
+  localAlloc->moveBefore(forOp);
+  localAlloc->setOperand(0, forOp.getInitArgs()[yieldArgNo]);
+  mmaOp->setOperand(2, localAlloc->getResult(0));
+  // Unlink the tmem_load from the yield. Short circuit the unused yield
+  // value with the corresponding iter arg.
+  forOp.getBody()->getTerminator()->setOperand(
+      yieldArgNo, forOp.getRegionIterArg(yieldArgNo));
+
+  // Add TMEM loads before all the uses
+  // TODO: We could be more efficient here, reusing loads instead of
+  // creating new ones for each use.
+  for (auto [user, argNo] : accUsers) {
+    addTMEMLoad(rewriter, localAlloc, user, argNo);
+  }
+  // Users of the loop result now read a tmem_load placed after the loop.
+  rewriter.setInsertionPointAfter(forOp);
+  auto afterLoopLoad = rewriter.create<ttng::TMEMLoadOp>(
+      forOp.getLoc(), loadType, localAlloc->getResult(0));
+  forOp->getResult(yieldArgNo).replaceAllUsesWith(afterLoopLoad->getResult(0));
+  // The in-loop load should have no remaining uses at this point.
+  localLoad->erase();
+}
+
+// Hoist invariant tmem_alloc. This could technically be done as general LICM
+// but controlling tmem liverange more precisely is likely to be important.
+static void hoistInvariantInputs(Operation *mmaOp, scf::ForOp forOp) {
+  for (auto operand : mmaOp->getOperands()) {
+    if (forOp.isDefinedOutsideOfLoop(operand))
+      continue; // Already outside the loop, nothing to hoist.
+    auto tmemAllocOp = operand.getDefiningOp<ttng::TMEMAllocOp>();
+    if (!tmemAllocOp || tmemAllocOp.getType().getMutableMemory())
+      continue; // Only tmem_alloc results with immutable memory qualify.
+    assert(tmemAllocOp.getSrc());
+    Value src = tmemAllocOp.getSrc();
+    SmallVector<Operation *> opToHoist = {tmemAllocOp.getOperation()};
+    // Also hoist simple unary elementwise ops that may have sunk into the loop.
+    while (Operation *defOp = src.getDefiningOp()) {
+      if (forOp.isDefinedOutsideOfLoop(src))
+        break;
+      if (!(isMemoryEffectFree(defOp) && isSpeculatable(defOp) &&
+            defOp->getNumOperands() == 1))
+        break;
+      opToHoist.push_back(defOp);
+      src = defOp->getOperand(0);
+    }
+    if (!forOp.isDefinedOutsideOfLoop(src))
+      continue; // The chain still depends on a value defined in the loop.
+    for (auto op : llvm::reverse(opToHoist)) { // Producers before consumers.
+      forOp.moveOutOfLoop(op);
+    }
+  }
+}
 class TritonNvidiaGPUKeepAccInTMemPass
     : public TritonNvidiaGPUKeepAccInTMemPassBase<
           TritonNvidiaGPUKeepAccInTMemPass> {
@@ -99,70 +231,6 @@ class TritonNvidiaGPUKeepAccInTMemPass
     }
   }
 
-  bool canKeepAccInTmem(scf::ForOp forOp, Operation *mmaOp,
-                        ttng::TMEMAllocOp &localAlloc,
-                        ttng::TMEMLoadOp &localLoad,
-                        SmallVector<std::pair<Operation *, int>> &accUsers,
-                        unsigned &yieldArgNo) {
-    // The expected sequence of instructions:
-    // %acc_tm = ttg.local_alloc %acc
-    // ttng.tc_gen5_mma %A_sh, %B_sh, %acc_tm
-    // %acc_res = ttg.local_load %acc_tm
-    localAlloc = mmaOp->getOperand(2).getDefiningOp<ttng::TMEMAllocOp>();
-    if (!localAlloc) {
-      return false;
-    }
-    for (auto user : localAlloc->getUsers()) {
-      if (isa<ttng::TMEMLoadOp>(user)) {
-        localLoad = cast<ttng::TMEMLoadOp>(user);
-      } else if (user != mmaOp) {
-        // The accumulator is used by another operation, not something we
-        // expect.
-        localLoad = nullptr;
-        return false;
-      }
-    }
-
-    SmallVector<Value> queue;
-    queue.push_back(localLoad->getResult(0));
-    bool foundDotCycle = false;
-    while (!queue.empty()) {
-      Value value = queue.pop_back_val();
-      for (auto &use : value.getUses()) {
-        if (use.getOwner() == localAlloc) {
-          foundDotCycle = true;
-          continue;
-        }
-        if (auto yieldOp = dyn_cast<scf::YieldOp>(use.getOwner())) {
-          if (yieldOp->getParentOp() == forOp) {
-            yieldArgNo = use.getOperandNumber();
-            queue.push_back(forOp.getRegionIterArg(yieldArgNo));
-            continue;
-          }
-          if (auto ifOp = dyn_cast<scf::IfOp>(yieldOp->getParentOp())) {
-            // TODO: Accumulator being used in the yield of ifOp means that
-            // it is being modified in the other branch of the ifOp. This is not
-            // something we can handle yet.
-            return false;
-          }
-          // Not sure what are we doing here. Back out.
-          return false;
-        }
-        accUsers.emplace_back(use.getOwner(), use.getOperandNumber());
-      }
-    }
-    return foundDotCycle;
-  }
-
-  void addTMEMLoad(IRRewriter &rewriter, ttng::TMEMAllocOp localAlloc,
-                   Operation *user, int argNo) {
-    rewriter.setInsertionPoint(user);
-    auto load = rewriter.create<ttng::TMEMLoadOp>(
-        user->getLoc(), user->getOperand(argNo).getType(),
-        localAlloc->getResult(0));
-    user->setOperand(argNo, load);
-  }
-
   void runOnForOp(scf::ForOp forOp) {
     SmallVector<Operation *> mmaOps;
     forOp.walk([&](Operation *mmaOp) {
@@ -177,43 +245,8 @@ class TritonNvidiaGPUKeepAccInTMemPass
     }
 
     for (auto mmaOp : mmaOps) {
-      // For the transformation to make sense, the accumulator must be
-      // reused by the same MMA operation in subsequent iterations.
-      SmallVector<std::pair<Operation *, int>> accUsers;
-      ttng::TMEMAllocOp localAlloc = nullptr;
-      ttng::TMEMLoadOp localLoad = nullptr;
-      unsigned yieldArgNo;
-      if (!canKeepAccInTmem(forOp, mmaOp, localAlloc, localLoad, accUsers,
-                            yieldArgNo)) {
-        continue;
-      }
-
-      assert(localLoad != nullptr);
-      assert(localAlloc != nullptr);
-      Type loadType = localLoad->getResult(0).getType();
-      IRRewriter rewriter(forOp);
-      localAlloc->moveBefore(forOp);
-      localAlloc->setOperand(0, forOp.getInitArgs()[yieldArgNo]);
-      mmaOp->setOperand(2, localAlloc->getResult(0));
-      // Unlink the local_load from the yield. Short circuit the unused yield
-      // value with the corresponding iter arg.
-      forOp.getBody()->getTerminator()->setOperand(
-          yieldArgNo, forOp.getRegionIterArg(yieldArgNo));
-
-      // Add TMEM loads before all the uses
-      // TODO: We could be more efficient here, reusing loads instead of
-      // creating new ones for each use.
-      for (auto [user, argNo] : accUsers) {
-        addTMEMLoad(rewriter, localAlloc, user, argNo);
-      }
-
-      rewriter.setInsertionPointAfter(forOp);
-      auto afterLoopLoad = rewriter.create<ttng::TMEMLoadOp>(
-          forOp.getLoc(), loadType, localAlloc->getResult(0));
-      forOp->getResult(yieldArgNo)
-          .replaceAllUsesWith(afterLoopLoad->getResult(0));
-
-      localLoad->erase();
+      hoistReadModifyWrite(mmaOp, forOp);
+      hoistInvariantInputs(mmaOp, forOp);
     }
   }
 };
diff --git a/python/test/unit/language/test_matmul.py b/python/test/unit/language/test_matmul.py
@@ -761,6 +761,7 @@ def mxfp8_mxfp4_matmul(  #
         BLOCK_N: tl.constexpr,  #
         BLOCK_K: tl.constexpr,  #
         NUM_STAGES: tl.constexpr):  #
+    tensor_scale: tl.constexpr = isinstance(a_scale.dtype, tl.pointer_type)
     pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_M)
     pid_m = pid % num_pid_m
@@ -781,7 +782,10 @@ def mxfp8_mxfp4_matmul(  #
     for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
         a = tl.load(a_ptrs)
         b = tl.load(b_ptrs)
-        scale_a = tl.load(a_scale_ptr)
+        if tensor_scale:
+            scale_a = tl.load(a_scale_ptr)
+        else:
+            scale_a = tl.full(a_scale_ptr.shape, a_scale.to(tl.int8), dtype=tl.int8)
         scale_b = tl.load(b_scale_ptr)
         accumulator = tl.dot_scaled(a, scale_a, "e5m2", b, scale_b, "e2m1", accumulator)
         a_ptrs += BLOCK_K * stride_ak
@@ -801,8 +805,9 @@ def mxfp8_mxfp4_matmul(  #
                                                        (128, 256, 256), (128, 128, 64), (128, 64, 128)])
 @pytest.mark.parametrize("NUM_STAGES", [1, 3])
 @pytest.mark.parametrize("B_TRANS", [True, False])
+@pytest.mark.parametrize("CONST_SCALE", [True, False])
 @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 10, reason="Requires compute capability >= 10")
-def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TRANS, device):
+def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TRANS, CONST_SCALE, device):
     if BLOCK_N == 256 and BLOCK_K == 256:
         NUM_STAGES = 2
 
@@ -826,12 +831,15 @@ def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TR
     b_scale = b_scale_mxfp4.data
 
     a_scale_ref = a_scale_mxfp4.to(torch.float32).repeat_interleave(32, dim=1)[:M, :K]
+    if CONST_SCALE:
+        a_scale_ref = torch.full_like(a_scale_ref, 2.0)
+        a_scale = 128  # 2.0 in e8m0
     b_scale_ref = b_scale_mxfp4.to(torch.float32).repeat_interleave(32, dim=1).T.contiguous()[:K, :N]
     ref_out = torch.matmul(a_ref * a_scale_ref, b_ref * b_scale_ref)
 
     output = a.new_empty((M, N), dtype=torch.float32)
     grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
-    out = mxfp8_mxfp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, a_scale.stride(0), a.stride(0), a.stride(1),
+    out = mxfp8_mxfp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, b_scale.stride(0), a.stride(0), a.stride(1),
                                    b.stride(0), b.stride(1), output.stride(0), output.stride(1), BLOCK_M, BLOCK_N,
                                    BLOCK_K, NUM_STAGES=NUM_STAGES)
     ttgir = out.asm["ttgir"]
diff --git a/test/TritonGPU/blackwell_acc_tmem.mlir b/test/TritonGPU/blackwell_acc_tmem.mlir
@@ -111,3 +111,33 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %res_f16 : tensor<128x128xf16, #blocked>
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 8, fp4Padded = true}>
+#smem = #ttg.shared_memory
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+#tmem_scales = #ttng.tensor_memory_scales_encoding<>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @hoist_constant_inputs
+  tt.func public @hoist_constant_inputs(%arg0: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %arg1: !ttg.memdesc<64x128xi8, #shared1, #smem>, %arg2: !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>, %arg3: i32, %arg4: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) {
+    %true = arith.constant true
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: arith.trunci
+    // CHECK: tt.splat
+    // CHECK: ttng.tmem_alloc
+    // CHECK: scf.for
+    // CHECK:  ttng.tc_gen5_mma_scaled
+    scf.for %arg5 = %c0_i32 to %arg3 step %c1_i32  : i32 {
+      %0 = arith.trunci %arg3 : i32 to i8
+      %1 = tt.splat %0 : i8 -> tensor<128x4xi8, #blocked1>
+      %2 = ttng.tmem_alloc %1 : (tensor<128x4xi8, #blocked1>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>
+      ttng.tc_gen5_mma_scaled %arg0, %arg1, %arg4, %arg2, %2, %true, %true lhs = e5m2 rhs = e2m1 : (!ttg.memdesc<128x128xf8E5M2, #shared, #smem>, !ttg.memdesc<64x128xi8, #shared1, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>, !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>, i1, i1) -> ()
+    }
+    tt.return
+  }
+}