Commit 5c05106

[BE] TCGen5MMAScaledOp accepts scales in shared memory (#6019)
Enable `TCGen5MMAScaledOp` to accept scales in shared memory, if efficient lowering from shmem to tmem is possible. This removes the use of `TMEMCopyOp` from most of the compiler stack, it is left only for the purposes of lowering `TCGen5MMAScaledOp`. It simplifies IR analysis a little, and improves semantics of `TCGen5MMAScaledOp`, as no assumptions about HW pipelining of tmem copy -> mma are needed. --------- Co-authored-by: Thomas Raoux <[email protected]>
1 parent e38a482 commit 5c05106
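
At the TTGIR level, the change looks roughly like the sketch below. It is condensed from the lit tests updated in this commit; the SSA names (%scale_vals, %a, %b, %acc, %a_scale_smem, %b_scale_smem, %a_scale_tmem, %b_scale_tmem) are illustrative and the trailing operand-type lists are elided.

  // Before: blocked scales were staged into TMEM with an explicit copy ahead of the MMA.
  %a_scale_smem = ttg.local_alloc %scale_vals : (tensor<1x2x32x4x4xi8, #blocked2>) -> !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>
  %a_scale_tmem = ttng.tmem_alloc : () -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>
  ttng.tmem_copy %a_scale_smem, %a_scale_tmem, : (!ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %a_scale_tmem, %b_scale_tmem, %true, %true lhs = e5m2 rhs = e5m2 : (...) -> ()

  // After: the scaled MMA op takes the shared-memory descriptor directly; the
  // tmem_alloc/tmem_copy pair is materialized later, during MMA lowering.
  %a_scale_smem = ttg.local_alloc %scale_vals : (tensor<1x2x32x4x4xi8, #blocked2>) -> !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %a_scale_smem, %b_scale_smem, %true, %true lhs = e5m2 rhs = e5m2 : (...) -> ()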

File tree

12 files changed, +152 -74 lines


include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 2 deletions
@@ -332,7 +332,7 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryE
 
   let description = [{
     $d += matrix_multiply($a, $b).
-    If not barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
+    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
     If there is a barrier the result will be safe to read after a barrier wait.
     If $two_ctas is set the op will execute a matmul across two contiguous CTAs, it will read the data distributed across the two CTAs.
     and syncronize both CTAs if the op is synchronous.
@@ -355,7 +355,7 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMe
 
   let description = [{
     $d += matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale))
-    If not barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
+    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
     If there is a barrier the result will be safe to read after a barrier wait.
   }];

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 2 additions & 2 deletions
@@ -580,8 +580,8 @@ Value addSmemStageToScaleLoad(Value scale, mlir::PatternRewriter &rewriter) {
       loadConsumer = cvt;
     } else {
       // Unrecognized pattern, bail out. In practice, this implies that MMA
-      // pipelining will not apply to the scaled dot op, since tmem_copy would
-      // not be inserted before the pipeline pass.
+      // pipelining will not apply to the scaled dot op, since scales will not
+      // be passed through SMEM to tc_gen5_mma_scaled.
       return scale;
     }
   }

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 33 additions & 26 deletions
@@ -141,32 +141,53 @@ class FuseTransMMAV3Plus : public OpRewritePattern<LocalAllocOp> {
 
 // Inject TMEM copy instructions into IR to efficiently load blocked scales for
 // scaled dot
-class InjectTMemCopy
-    : public OpRewritePattern<triton::nvidia_gpu::TMEMAllocOp> {
+class UseShmemForScales
+    : public OpRewritePattern<triton::nvidia_gpu::TCGen5MMAScaledOp> {
 public:
-  using OpRewritePattern<triton::nvidia_gpu::TMEMAllocOp>::OpRewritePattern;
+  using OpRewritePattern<
+      triton::nvidia_gpu::TCGen5MMAScaledOp>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(triton::nvidia_gpu::TMEMAllocOp tmemAlloc,
+  LogicalResult matchAndRewrite(triton::nvidia_gpu::TCGen5MMAScaledOp mmaOp,
                                 PatternRewriter &rewriter) const override {
-    auto dstType = tmemAlloc.getResult().getType();
+    auto aScale = mmaOp.getAScale();
+    auto bScale = mmaOp.getBScale();
+    LogicalResult ret = failure();
+    if (aScale && isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
+                      aScale.getType().getEncoding())) {
+      if (rewriteOperand(mmaOp.getAScaleMutable(), rewriter).succeeded())
+        ret = success();
+    }
+    if (bScale && isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
+                      bScale.getType().getEncoding())) {
+      if (rewriteOperand(mmaOp.getBScaleMutable(), rewriter).succeeded())
+        ret = success();
+    }
+    return ret;
+  }
 
-    // Only applies to TMEMAlloc with scales encoding
-    if (!isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
-            dstType.getEncoding())) {
+private:
+  LogicalResult rewriteOperand(OpOperand &opOperand,
+                               PatternRewriter &rewriter) const {
+    auto src = cast<TypedValue<MemDescType>>(opOperand.get());
+    auto tmemAlloc = src.getDefiningOp<triton::nvidia_gpu::TMEMAllocOp>();
+    if (!tmemAlloc) {
      return failure();
    }
+    auto dstType = tmemAlloc.getResult().getType();
 
    if (!tmemAlloc.getSrc()) {
      return failure();
    }
 
    // Look for a sequence
    // local_load
-    // -> reshape(..., (BLOCK_MN / 128, BLOCK_K / scale_vec_size / 4, 32, 4, 4)
+    // -> reshape(..., (BLOCK_MN / 128, BLOCK_K / scale_vec_size / 4, 32, 4,
+    // 4)
    // -> transpose(..., (0, 3, 2, 1, 4))
    // -> reshape(..., (BLOCK_MN, BLOCK_K / scale_vec_size)
    // -> tmem_alloc
-    // and replace it with tmem_alloc -> tmem_copy
+    // -> tc_gen_mma_scaled
+    // and replace it with local_alloc -> tc_gen5_mma_scaled
    auto scale2DShape = dstType.getShape();
    auto blockMN = scale2DShape[0];
    auto numScales = scale2DShape[1];
@@ -195,24 +216,10 @@ class InjectTMemCopy
    if (!localLoad || !isTmemCopyCompatible(localLoad.getSrc().getType())) {
      return failure();
    }
-    MemDescType newType = MemDescType::get(
-        dstType.getShape(), dstType.getElementType(), dstType.getEncoding(),
-        dstType.getMemorySpace(), /*mutableMemory=*/true);
-    Value newTmemAlloc = rewriter.create<triton::nvidia_gpu::TMEMAllocOp>(
-        tmemAlloc.getLoc(), newType, Value());
-
-    // Since tcgen05.cp followed by tcgen05.mma is guaranteed to execute in that
-    // order, we do not need to wait for the completion of the copy before MMA.
-    rewriter.create<triton::nvidia_gpu::TMEMCopyOp>(
-        newTmemAlloc.getLoc(), localLoad.getSrc(), newTmemAlloc,
-        Value() /* barrier */);
-
-    rewriter.replaceOp(tmemAlloc, newTmemAlloc);
-
+    opOperand.assign(localLoad.getSrc());
    return success();
  }
 
-private:
  template <typename Op> Op getNextOp(Value op) const {
    while (auto cvtOp = op.getDefiningOp<ConvertLayoutOp>()) {
      op = cvtOp.getSrc();
@@ -285,7 +292,7 @@ class TritonGPUOptimizeDotOperandsPass
    mlir::RewritePatternSet patterns(context);
    patterns.add<SwizzleShmemConvert>(context);
    patterns.add<FuseTransMMAV3Plus>(context);
-    patterns.add<InjectTMemCopy>(context);
+    patterns.add<UseShmemForScales>(context);
    ConvertLayoutOp::getCanonicalizationPatterns(patterns, context);
    if (failed(applyPatternsGreedily(m, std::move(patterns))))
      signalPassFailure();
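
In IR terms, the renamed pattern matches a scale operand of tc_gen5_mma_scaled whose defining tmem_alloc is fed by the load/reshape/transpose/reshape chain (possibly through convert_layout ops) rooted at a shared-memory allocation, and rebinds the operand to that allocation, leaving the old chain to be cleaned up as dead code. A condensed sketch, adapted from the scales_in_shmem test below; value names are illustrative and most types and layouts are elided:

  %la = ttg.local_alloc : () -> !ttg.memdesc<2x512xi8, #shared1, #smem, mutable>
  %ll = ttg.local_load %la : ...
  %r1 = tt.reshape %ll : ...                               // (BLOCK_MN / 128, BLOCK_K / scale_vec_size / 4, 32, 4, 4)
  %tr = tt.trans %r1 {order = array<i32: 0, 3, 2, 1, 4>} : ...
  %r2 = tt.reshape %tr : ...                               // (BLOCK_MN, BLOCK_K / scale_vec_size)
  %tm = ttng.tmem_alloc %r2 : ...
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %tm, %b_scale, ...  // before: scale taken from TMEM
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %la, %b_scale, ...  // after: scale operand rebound to shared memory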

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 1 addition & 1 deletion
@@ -560,7 +560,7 @@ bool loadRequiresAdditionalBuffer(Operation *loadOp) {
   ttg::LocalAllocOp alloc =
       dyn_cast<ttg::LocalAllocOp>(*loadOp->getUsers().begin());
   if (alloc && alloc->hasOneUse()) {
-    if (isa<ttng::TMEMCopyOp>(*alloc->getUsers().begin())) {
+    if (isa<ttng::TCGen5MMAScaledOp>(*alloc->getUsers().begin())) {
      return true;
    }
  }

lib/Dialect/TritonGPU/Transforms/Pipeliner/TC05MMAPipeline.cpp

Lines changed: 3 additions & 7 deletions
@@ -571,18 +571,14 @@ void createBarrierAndWaitOps(IRRewriter &builder, scf::ForOp forOp,
 }
 
 bool isSafeToPipeline(ttng::TCGen5MMAScaledOp scaledDot) {
-  auto getNumUsers = [](Value value) {
-    return std::distance(value.user_begin(), value.user_end());
-  };
-
   auto isCopiedByTMEMCopy = [=](Value scale) {
-    if (getNumUsers(scale) != 2) {
-      // MMA and TMEM copy must be the only users
+    if (!scale.hasOneUse()) {
+      // Should be used only by the scaled dot op
      return false;
    }
 
    for (auto user : scale.getUsers()) {
-      if (!isa<ttng::TMEMCopyOp, ttng::TCGen5MMAScaledOp>(user)) {
+      if (!isa<ttng::TCGen5MMAScaledOp>(user)) {
        // If the scale is used by TMEM copy and the only other user is the
        // scaled dot op, MMA pipelining is safe to apply.
        return false;

lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp

Lines changed: 47 additions & 3 deletions
@@ -29,7 +29,7 @@ class SyncMMALowering : public OpRewritePattern<TCGen5MMAOpTy> {
       return failure();
     MLIRContext *ctx = op.getContext();
     Location loc = op.getLoc();
-    Attribute sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(ctx);
+    Attribute sharedMemorySpace = SharedMemorySpaceAttr::get(ctx);
     auto barrierCTALayout = CTALayoutAttr::get(
         /*context=*/ctx, /*CTAsPerCGA=*/{1},
         /*CTASplitNum=*/{1}, /*CTAOrder=*/{0});
@@ -51,6 +51,50 @@ class SyncMMALowering : public OpRewritePattern<TCGen5MMAOpTy> {
   }
 };
 
+struct TCGen5MMAScaleSharedToTmemConversion
+    : public OpRewritePattern<TCGen5MMAScaledOp> {
+  using OpRewritePattern<TCGen5MMAScaledOp>::OpRewritePattern;
+
+  bool lowerScaleToTmem(OpOperand &operand, PatternRewriter &rewriter) const {
+    Location loc = operand.getOwner()->getLoc();
+    MLIRContext *context = operand.getOwner()->getContext();
+    Attribute tensorMemorySpace = TensorMemorySpaceAttr::get(context);
+    auto oldType = cast<MemDescType>(operand.get().getType());
+    Type elType = oldType.getElementType();
+    SwizzledSharedEncodingAttr oldEncoding =
+        cast<SwizzledSharedEncodingAttr>(oldType.getEncoding());
+    CTALayoutAttr CTALayout = getCTALayout(oldEncoding);
+    ArrayRef<unsigned> CTASplitNum = CTALayout.getCTASplitNum();
+    ArrayRef<int64_t> shape = oldType.getAllocShape();
+    Attribute scaleEncoding = TensorMemoryScalesEncodingAttr::get(
+        context, CTASplitNum[0], CTASplitNum[1]);
+    Type scaleAType =
+        MemDescType::get(shape, elType, scaleEncoding, tensorMemorySpace,
+                         /*mutableMemory=*/true);
+    auto tmemAlloc = rewriter.create<TMEMAllocOp>(loc, scaleAType, Value());
+    rewriter.create<TMEMCopyOp>(loc, operand.get(), tmemAlloc,
+                                /*barrier*/ Value());
+    operand.set(tmemAlloc);
+    return true;
+  }
+
+  LogicalResult matchAndRewrite(TCGen5MMAScaledOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    MLIRContext *context = op->getContext();
+    auto aScaleType = op.getAScale().getType();
+    auto bScaleType = op.getBScale().getType();
+    bool anyChanged = false;
+    if (isa<SwizzledSharedEncodingAttr>(aScaleType.getEncoding())) {
+      anyChanged = lowerScaleToTmem(op.getAScaleMutable(), rewriter);
+    }
+    if (isa<SwizzledSharedEncodingAttr>(bScaleType.getEncoding())) {
+      anyChanged = lowerScaleToTmem(op.getBScaleMutable(), rewriter);
+    }
+    return LogicalResult::success(anyChanged);
+  }
+};
+
 class TritonNvidiaGPUMMALoweringPass
     : public TritonNvidiaGPUMMALoweringPassBase<
           TritonNvidiaGPUMMALoweringPass> {
@@ -61,8 +105,8 @@ class TritonNvidiaGPUMMALoweringPass
 
     mlir::RewritePatternSet patterns(context);
     patterns
-        .add<SyncMMALowering<TCGen5MMAOp>, SyncMMALowering<TCGen5MMAScaledOp>>(
-            context);
+        .add<SyncMMALowering<TCGen5MMAOp>, SyncMMALowering<TCGen5MMAScaledOp>,
+             TCGen5MMAScaleSharedToTmemConversion>(context);
    if (applyPatternsGreedily(m, std::move(patterns)).failed())
      signalPassFailure();
  }
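
The new TCGen5MMAScaleSharedToTmemConversion pattern is where shared-memory scale operands are finally turned back into TMEM operands: for each scale with a swizzled shared encoding it allocates a mutable TMEM buffer with the scales encoding, emits a barrier-less tmem_copy into it, and redirects the MMA operand. A rough sketch of the resulting IR, with illustrative names and elided type lists:

  // Input to MMA lowering: scale operands still live in shared memory.
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %a_scale_smem, %b_scale_smem, %true, %true lhs = e5m2 rhs = e5m2 : (...) -> ()

  // After TCGen5MMAScaleSharedToTmemConversion (shown for one operand; both are converted in practice):
  %a_scale_tmem = ttng.tmem_alloc : () -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>
  ttng.tmem_copy %a_scale_smem, %a_scale_tmem, : (...) -> ()
  ttng.tc_gen5_mma_scaled %a, %b, %acc, %a_scale_tmem, %b_scale_smem, %true, %true lhs = e5m2 rhs = e5m2 : (...) -> ()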

python/test/unit/language/test_matmul.py

Lines changed: 2 additions & 2 deletions
@@ -484,6 +484,7 @@ def test_blocked_scale_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, USE_
                b.stride(1), output.stride(0), output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K,
                NUM_STAGES=NUM_STAGES, USE_2D_SCALE_LOAD=USE_2D_SCALE_LOAD)
     ttgir = out.asm["ttgir"]
+    ptx = out.asm["ptx"]
 
     def flatten_scale(scale):
         num_chunk_m, num_chunk_k, _, _, _ = scale.shape
@@ -508,8 +509,7 @@ def flatten_scale(scale):
     if USE_2D_SCALE_LOAD:
         # Due to an issue in the coalescing pass, tmem_copy can not be generated for the 5D load.
         # The issue is fixed using the patch from https://github.com/triton-lang/triton/pull/4914
-        assert "tmem_copy" in ttgir
-
+        assert "tcgen05.cp" in ptx
     if NUM_STAGES > 1:
         if BLOCK_M == BLOCK_K and BLOCK_N == BLOCK_K:
             load_pipelined = ttgir.count(f"ttg.local_alloc : () -> !ttg.memdesc<{NUM_STAGES}x{BLOCK_M}x{BLOCK_K}") == 2

test/TritonGPU/dot-operands.mlir

Lines changed: 28 additions & 11 deletions
@@ -95,27 +95,44 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 #smem = #ttg.shared_memory
 #blocked4 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked8 = #ttg.blocked<{sizePerThread = [1, 1, 1, 2, 4], threadsPerWarp = [1, 1, 16, 2, 1], warpsPerCTA = [2, 1, 2, 1, 1], order = [4, 3, 2, 1, 0]}>
 #blocked9 = #ttg.blocked<{sizePerThread = [1, 2, 1, 1, 4], threadsPerWarp = [1, 2, 16, 1, 1], warpsPerCTA = [2, 1, 2, 1, 1], order = [4, 1, 2, 3, 0]}>
 #blocked10 = #ttg.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}>
 #blocked11 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
 #tmem_scales = #ttng.tensor_memory_scales_encoding<>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
-  // CHECK-LABEL: @inject_tmem_copy
-  // CHECK: ttng.tmem_alloc {{.*}}, mutable
-  // CHECK: ttng.tmem_copy
+  // CHECK-LABEL: @scales_in_shmem
+  // CHECK: %[[A_LA:.*]] = ttg.local_alloc
+  // CHECK: %[[B_LA:.*]] = ttg.local_alloc
+  // CHECK: ttng.tc_gen5_mma_scaled {{.*}}, %[[A_LA]], %[[B_LA]],
 
-  tt.func public @inject_tmem_copy(%scale: tensor<2x512x!tt.ptr<i8>, #blocked4> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) attributes {noinline = false} {
-    %75 = ttg.local_alloc : () -> !ttg.memdesc<2x512xi8, #shared1, #smem, mutable>
-    %180 = ttg.local_load %75 : !ttg.memdesc<2x512xi8, #shared1, #smem, mutable, 3x2x512> -> tensor<2x512xi8, #blocked4>
-    %183 = tt.reshape %180 : tensor<2x512xi8, #blocked4> -> tensor<2x1x32x4x4xi8, #blocked8>
-    %184 = tt.trans %183 {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked8> -> tensor<2x4x32x1x4xi8, #blocked9>
-    %187 = ttg.convert_layout %184 : tensor<2x4x32x1x4xi8, #blocked9> -> tensor<2x4x32x1x4xi8, #blocked10>
-    %188 = tt.reshape %187 : tensor<2x4x32x1x4xi8, #blocked10> -> tensor<256x4xi8, #blocked11>
-    %190 = ttng.tmem_alloc %188 : (tensor<256x4xi8, #blocked11>) -> !ttg.memdesc<256x4xi8, #tmem_scales, #ttng.tensor_memory>
+  tt.func public @scales_in_shmem(
+    %scale: tensor<2x512x!tt.ptr<i8>, #blocked4> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32},
+    %A_sh: !ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>,
+    %B_sh: !ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>,
+    %acc_tm: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory>
+  ) attributes {noinline = false} {
+    %true = arith.constant true
+    %A_la = ttg.local_alloc : () -> !ttg.memdesc<2x512xi8, #shared1, #smem, mutable>
+    %B_la = ttg.local_alloc : () -> !ttg.memdesc<2x512xi8, #shared1, #smem, mutable>
+    %A_ll = ttg.local_load %A_la : !ttg.memdesc<2x512xi8, #shared1, #smem, mutable, 3x2x512> -> tensor<2x512xi8, #blocked4>
+    %B_ll = ttg.local_load %B_la : !ttg.memdesc<2x512xi8, #shared1, #smem, mutable, 3x2x512> -> tensor<2x512xi8, #blocked4>
+    %A_r = tt.reshape %A_ll : tensor<2x512xi8, #blocked4> -> tensor<2x1x32x4x4xi8, #blocked8>
+    %B_r = tt.reshape %B_ll : tensor<2x512xi8, #blocked4> -> tensor<2x1x32x4x4xi8, #blocked8>
+    %A_tr = tt.trans %A_r {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked8> -> tensor<2x4x32x1x4xi8, #blocked9>
+    %B_tr = tt.trans %B_r {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked8> -> tensor<2x4x32x1x4xi8, #blocked9>
+    %A_cv = ttg.convert_layout %A_tr : tensor<2x4x32x1x4xi8, #blocked9> -> tensor<2x4x32x1x4xi8, #blocked10>
+    %B_cv = ttg.convert_layout %B_tr : tensor<2x4x32x1x4xi8, #blocked9> -> tensor<2x4x32x1x4xi8, #blocked10>
+    %A_r2 = tt.reshape %A_cv : tensor<2x4x32x1x4xi8, #blocked10> -> tensor<256x4xi8, #blocked11>
+    %B_r2 = tt.reshape %B_cv : tensor<2x4x32x1x4xi8, #blocked10> -> tensor<256x4xi8, #blocked11>
+    %A_tm = ttng.tmem_alloc %A_r2 : (tensor<256x4xi8, #blocked11>) -> !ttg.memdesc<256x4xi8, #tmem_scales, #ttng.tensor_memory>
+    %B_tm = ttng.tmem_alloc %B_r2 : (tensor<256x4xi8, #blocked11>) -> !ttg.memdesc<256x4xi8, #tmem_scales, #ttng.tensor_memory>
+    ttng.tc_gen5_mma_scaled %A_sh, %B_sh, %acc_tm, %A_tm, %B_tm, %true, %true lhs = e5m2 rhs = e5m2 {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (!ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory>, !ttg.memdesc<256x4xi8, #tmem_scales, #ttng.tensor_memory>, !ttg.memdesc<256x4xi8, #tmem_scales, #ttng.tensor_memory>, i1, i1) -> ()
    tt.return
  }
 }

test/TritonGPU/loop-pipeline-blackwell.mlir

Lines changed: 1 addition & 6 deletions
@@ -384,16 +384,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     %122 = tt.load %arg19 : tensor<1x2x32x4x4x!tt.ptr<i8>, #blocked2>
 
     %137 = ttg.local_alloc %121 : (tensor<1x2x32x4x4xi8, #blocked2>) -> !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>
-    %130 = ttng.tmem_alloc : () -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>
-    ttng.tmem_copy %137, %130, : (!ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
-
     %139 = ttg.local_alloc %122 : (tensor<1x2x32x4x4xi8, #blocked2>) -> !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>
-    %131 = ttng.tmem_alloc : () -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>
-    ttng.tmem_copy %139, %131, : (!ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
 
     %127 = ttng.tmem_alloc %arg15 : (tensor<128x128xf32, #blocked4>) -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
 
-    ttng.tc_gen5_mma_scaled %118, %120, %127, %130, %131, %true, %true lhs = e5m2 rhs = e5m2 : (!ttg.memdesc<128x256xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<256x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory, mutable>, !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory, mutable>, i1, i1) -> ()
+    ttng.tc_gen5_mma_scaled %118, %120, %127, %137, %139, %true, %true lhs = e5m2 rhs = e5m2 : (!ttg.memdesc<128x256xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<256x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, i1, i1) -> ()
     %132 = ttng.tmem_load %127 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked4>
 
     %133 = tt.addptr %arg16, %incr_A : tensor<128x256x!tt.ptr<f8E5M2>, #blocked>, tensor<128x256xi32, #blocked>
