Commit 40c1838
[AMD] Support scale is none in DotScaledOp for gfx950 (#5931)
This PR supports the case where one or both scales are None in DotScaledOp on gfx950. If a scale is None, a constant scale tensor with value 1.0 is created.
1 parent 73a724f commit 40c1838
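As a user-level sketch (not code from this commit; the kernel name, pointer arguments, and shapes are illustrative), a gfx950 kernel can now pass None for either scale operand of tl.dot_scaled and the backend substitutes a unit scale:

import triton
import triton.language as tl


@triton.jit
def fp4_dot_unit_b_scale(a_ptr, b_ptr, a_scale_ptr, out_ptr,  #
                         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                         BLOCK_K: tl.constexpr, VEC_SIZE: tl.constexpr):
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K // 2)  # two packed e2m1 values per byte along K
    offs_scale_k = tl.arange(0, BLOCK_K // VEC_SIZE)
    a = tl.load(a_ptr + offs_m[:, None] * (BLOCK_K // 2) + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * BLOCK_N + offs_n[None, :])
    scale_a = tl.load(a_scale_ptr + offs_m[:, None] * (BLOCK_K // VEC_SIZE) + offs_scale_k[None, :])
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    # B has no scale: passing None is now accepted and behaves like an all-1.0 scale.
    acc = tl.dot_scaled(a, scale_a, "e2m1", b, None, "e2m1", acc)
    tl.store(out_ptr + offs_m[:, None] * BLOCK_N + offs_n[None, :], acc)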

5 files changed: +185 −39 lines changed

python/test/unit/language/test_matmul.py

Lines changed: 33 additions & 10 deletions
@@ -682,8 +682,10 @@ def block_scale_fp4_matmul( #
     # Two e2m1 values per K
     offs_k = tl.arange(0, BLOCK_K // 2)
     offs_scale_k = tl.arange(0, BLOCK_K // VEC_SIZE)
-    a_scale_ptr = a_scale + offs_am[:, None] * stride_scale + offs_scale_k[None, :]
-    b_scale_ptr = b_scale + offs_bn[:, None] * stride_scale + offs_scale_k[None, :]
+    if a_scale is not None:
+        a_scale_ptr = a_scale + offs_am[:, None] * stride_scale + offs_scale_k[None, :]
+    if b_scale is not None:
+        b_scale_ptr = b_scale + offs_bn[:, None] * stride_scale + offs_scale_k[None, :]
     a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
     b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=output_ptr.dtype.element_ty)
@@ -692,13 +694,21 @@ def block_scale_fp4_matmul( #
         valid_k = offs_k < k_remaining
         a = tl.load(a_ptrs, mask=valid_k[None, :], other=0)
         b = tl.load(b_ptrs, mask=valid_k[:, None], other=0)
-        scale_a = tl.load(a_scale_ptr)
-        scale_b = tl.load(b_scale_ptr)
+        if a_scale is not None:
+            scale_a = tl.load(a_scale_ptr)
+        else:
+            scale_a = None
+        if b_scale is not None:
+            scale_b = tl.load(b_scale_ptr)
+        else:
+            scale_b = None
         accumulator = tl.dot_scaled(a, scale_a, "e2m1", b, scale_b, "e2m1", accumulator)
         a_ptrs += (BLOCK_K // 2) * stride_ak
         b_ptrs += (BLOCK_K // 2) * stride_bk
-        a_scale_ptr += BLOCK_K // VEC_SIZE
-        b_scale_ptr += BLOCK_K // VEC_SIZE
+        if a_scale is not None:
+            a_scale_ptr += BLOCK_K // VEC_SIZE
+        if b_scale is not None:
+            b_scale_ptr += BLOCK_K // VEC_SIZE
     offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
     offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
     output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
@@ -709,12 +719,18 @@ def block_scale_fp4_matmul( #
 @pytest.mark.parametrize("M, N, K", [(1024, 512, 256), (2, 4, 64)])
 @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 128), (256, 128, 128), (128, 256, 128),
                                                        (128, 256, 256), (128, 128, 64), (128, 64, 128)])
+@pytest.mark.parametrize("with_a_scale", [True, False])
+@pytest.mark.parametrize("with_b_scale", [True, False])
 @pytest.mark.parametrize(("scale_type", "VEC_SIZE"), [("float8_e8m0fnu", 32), ("float8_e4m3fn", 16)],
                          ids=["mxfp4", "nvfp4"])
 @pytest.mark.parametrize("nonKDim", ([0, 16, 32] if is_hip_cdna() else []))
-def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, scale_type, nonKDim, device):
-    if is_cuda() and torch.cuda.get_device_capability()[0] < 10:
-        pytest.skip("Requires compute capability >= 10")
+def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, with_a_scale, with_b_scale, scale_type, nonKDim,
+                         device):
+    if is_cuda():
+        if torch.cuda.get_device_capability()[0] < 10:
+            pytest.skip("Requires compute capability >= 10")
+        if not (with_a_scale and with_b_scale):
+            pytest.skip("None aScale/bScale is only tested on AMD backend for now")
     elif is_hip():
         if not is_hip_mi350():
             pytest.skip("Scaled fp4 matmul is only natively supported on MI350")
@@ -750,14 +766,21 @@ def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, scale_typ

     a_scale_ref = a_scale_ref.to(torch.float32).repeat_interleave(VEC_SIZE, dim=1)[:M, :K]
     b_scale_ref = b_scale_ref.to(torch.float32).repeat_interleave(VEC_SIZE, dim=1).T.contiguous()[:K, :N]
+    stride_scale = a_scale.stride(0)
+    if not with_a_scale:
+        a_scale = None
+        a_scale_ref = 1.0
+    if not with_b_scale:
+        b_scale = None
+        b_scale_ref = 1.0
     ref_out = torch.matmul(a_mxfp4.to(torch.float32) * a_scale_ref, b_ref * b_scale_ref)

     output = a.new_empty((M, N), dtype=torch.float32)
     grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
     kernel_kwargs = {}
     if is_hip():
         kernel_kwargs["matrix_instr_nonkdim"] = nonKDim
-    block_scale_fp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, a_scale.stride(0), a.stride(0), a.stride(1),
+    block_scale_fp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, stride_scale, a.stride(0), a.stride(1),
                                  b.stride(0), b.stride(1), output.stride(0), output.stride(1), VEC_SIZE, BLOCK_M,
                                  BLOCK_N, BLOCK_K, NUM_STAGES=NUM_STAGES, **kernel_kwargs)

python/triton/language/semantic.py

Lines changed: 2 additions & 2 deletions
@@ -1649,8 +1649,8 @@ def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format: str, rhs: tl.te
     allowed_formats = {"e2m1", "e4m3", "e5m2", "bf16", "fp16"}
     assert lhs_format in allowed_formats, f"NYI: lhs_format {lhs_format}"
     assert rhs_format in allowed_formats, f"NYI: rhs_format {rhs_format}"
-    rhs_scale_is_none = isinstance(rhs_scale, tl.constexpr) and rhs_scale.value is None
-    lhs_scale_is_none = isinstance(lhs_scale, tl.constexpr) and lhs_scale.value is None
+    rhs_scale_is_none = rhs_scale is None or (isinstance(rhs_scale, tl.constexpr) and rhs_scale.value is None)
+    lhs_scale_is_none = lhs_scale is None or (isinstance(lhs_scale, tl.constexpr) and lhs_scale.value is None)
     lhs = _bitcast_to_fp_type(lhs, lhs_format, builder)
     rhs = _bitcast_to_fp_type(rhs, rhs_format, builder)
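A small sketch (plain Python, hypothetical helper name) of why the check above needs both clauses: a missing scale can reach dot_scaled either as a raw Python None or as a tl.constexpr wrapping None, and the old isinstance-only check caught only the latter.

import triton.language as tl


def scale_is_none(scale):
    # Mirrors the updated check in semantic.dot_scaled.
    return scale is None or (isinstance(scale, tl.constexpr) and scale.value is None)


assert scale_is_none(None)                # raw None: missed by the old check
assert scale_is_none(tl.constexpr(None))  # constexpr-wrapped None: handled before too
assert not scale_is_none(tl.constexpr(3))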

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx950 matrix-instruction-size=0' | FileCheck %s --check-prefixes CHECK

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
// CHECK{LITERAL}: #ttg.linear<{register = [[0, 2], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 1]], warp = [[0, 0], [32, 0]], block = []}>
// CHECK{LITERAL}: #ttg.linear<{register = [[0, 2], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 1]], warp = [[32, 0], [0, 0]], block = []}>
// CHECK-LABEL: mfma_dot_scaled_mxfp4_mxfp4
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @mfma_dot_scaled_mxfp4_mxfp4(
      %arg0: tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
      %arg1: tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
      %arg2: tensor<128x4xi8>,
      %arg3: tensor<128x4xi8>,
      %arg4: tensor<128x128x!tt.ptr<f32>, #blocked>
  ) {
    // CHECK-NOT: arith.constant dense<127> : tensor<128x4xi8, #linear>
    // CHECK-NOT: arith.constant dense<127> : tensor<128x4xi8, #linear1>
    // CHECK-NOT: tt.fp_to_fp
    // CHECK: %[[C:.+]] = ttg.convert_layout {{.*}} : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma>
    // CHECK: %[[A:.+]] = ttg.convert_layout {{.*}} : tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
    // CHECK: %[[B:.+]] = ttg.convert_layout {{.*}} : tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
    // CHECK: %[[SCALE0:.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<128x4xi8, #linear>
    // CHECK: %[[SCALE1:.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<128x4xi8, #linear1>
    // CHECK: tt.dot_scaled %[[A]] scale %[[SCALE0]], %[[B]] scale %[[SCALE1]], %[[C]] lhs = e2m1 rhs = e2m1
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
    %1 = tt.dot_scaled %arg0 scale %arg2, %arg1 scale %arg3, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>, tensor<128x4xi8> * tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>, tensor<128x4xi8> -> tensor<128x128xf32, #blocked>
    tt.store %arg4, %1 : tensor<128x128x!tt.ptr<f32>, #blocked>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
// CHECK-LABEL: mfma_dot_scaled_mxfp4_fp4
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @mfma_dot_scaled_mxfp4_fp4(
      %arg0: tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
      %arg1: tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
      %arg2: tensor<128x4xi8>,
      %arg3: tensor<128x128x!tt.ptr<f32>, #blocked>
  ) {
    // CHECK-NOT: tt.fp_to_fp
    // CHECK: %[[CST1:.+]] = arith.constant dense<127> : tensor<128x4xi8, #linear>
    // CHECK: %[[SCALE0:.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<128x4xi8, #linear1>
    // CHECK: tt.dot_scaled {{.*}} scale %[[SCALE0]], {{.*}} scale %[[CST1]], {{.*}} lhs = e2m1 rhs = e2m1
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
    %1 = tt.dot_scaled %arg0 scale %arg2, %arg1, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>, tensor<128x4xi8> * tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
    tt.store %arg3, %1 : tensor<128x128x!tt.ptr<f32>, #blocked>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
// CHECK-LABEL: mfma_dot_scaled_fp4_mxfp4
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @mfma_dot_scaled_fp4_mxfp4(
      %arg0: tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
      %arg1: tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
      %arg2: tensor<128x4xi8>,
      %arg3: tensor<128x128x!tt.ptr<f32>, #blocked>
  ) {
    // CHECK-NOT: tt.fp_to_fp
    // CHECK: %[[CST0:.+]] = arith.constant dense<127> : tensor<128x4xi8, #linear>
    // CHECK: %[[SCALE1:.+]] = ttg.convert_layout {{.*}} : {{.*}} -> tensor<128x4xi8, #linear1>
    // CHECK: tt.dot_scaled {{.*}} scale %[[CST0]], {{.*}} scale %[[SCALE1]], {{.*}} lhs = e2m1 rhs = e2m1
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
    %1 = tt.dot_scaled %arg0, %arg1 scale %arg2, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>, tensor<128x4xi8> -> tensor<128x128xf32, #blocked>
    tt.store %arg3, %1 : tensor<128x128x!tt.ptr<f32>, #blocked>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
// CHECK-LABEL: mfma_dot_scaled_fp4_fp4
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @mfma_dot_scaled_fp4_fp4(
      %arg0: tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
      %arg1: tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
      %arg2: tensor<128x128x!tt.ptr<f32>, #blocked>
  ) {
    // CHECK-NOT: tt.fp_to_fp
    // CHECK-DAG: %[[CST0:.+]] = arith.constant dense<127> : tensor<128x4xi8, #linear>
    // CHECK-DAG: %[[CST1:.+]] = arith.constant dense<127> : tensor<128x4xi8, #linear1>
    // CHECK: tt.dot_scaled {{.*}} scale %[[CST1]], {{.*}} scale %[[CST0]], {{.*}} lhs = e2m1 rhs = e2m1
    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
    %1 = tt.dot_scaled %arg0, %arg1, %cst lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
    tt.store %arg2, %1 : tensor<128x128x!tt.ptr<f32>, #blocked>
    tt.return
  }
}

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 26 additions & 11 deletions
@@ -372,7 +372,8 @@ struct DotOpMFMAConversionHelper {
   /// rawElems is a vector of kWidth elements. We need to prepare vector(s) of
   /// kBase elements for each mfma instruction
   SmallVector<Value> extractOperands(Value rawElems, int kWidth, int kBase,
-                                     Type type, bool preserveBF16) const {
+                                     Type type, bool preserveBF16,
+                                     bool isConstantScale = false) const {
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     int kpack = kWidth / kBase;
     SmallVector<Value> results;
@@ -393,9 +394,20 @@ struct DotOpMFMAConversionHelper {
         }
       }
       if (type.getIntOrFloatBitWidth() == 8) {
-        if (1 == kBase)
+        if (1 == kBase) {
           // This is only for the scale operands of scaled mfma on MI350
-          results.push_back(b.zext(i32_ty, b.bitcast(vec, i8_ty)));
+          if (isConstantScale) {
+            // If the scale is constant(created by arith::ConstantOp), it will
+            // be put in a sgpr instead of vgpr. In that case, instead of
+            // vgpr[7:0], the instruction reads sgpr[30:23] as the scale value.
+            // So we need to manually left shift the scale by 23 bits to meet
+            // the requirement.
+            results.push_back(b.shl(
+                i32_ty, b.zext(i32_ty, b.bitcast(vec, i8_ty)), b.i32_val(23)));
+          } else {
+            results.push_back(b.zext(i32_ty, b.bitcast(vec, i8_ty)));
+          }
+        }
         if (4 == kBase)
           // This is for int8 on pre- MI300 GPUs
           results.push_back(b.bitcast(vec, i32_ty));
@@ -413,10 +425,9 @@

   /// Converts dot operand structure to value table and converts types
   /// appropriate for mfma instructions
-  virtual SmallVector<ValueTable>
-  getValuesFromDotOperandLayoutStruct(Value value, int batch, int n0, int n1,
-                                      int kWidth, int kBase, Type type,
-                                      bool allowXF32, bool preserveBF16) const {
+  virtual SmallVector<ValueTable> getValuesFromDotOperandLayoutStruct(
+      Value value, int batch, int n0, int n1, int kWidth, int kBase, Type type,
+      bool allowXF32, bool preserveBF16, bool isConstantScale = false) const {
     auto tb = TritonLLVMOpBuilder(loc, rewriter);
     auto elems = unpackLLElements(loc, value, rewriter);
     int kpack = kWidth / kBase;
@@ -445,8 +456,8 @@
             vals = extractOperands(rawElems, kWidth, kBase, f32_ty,
                                    preserveBF16);
           } else if (type.getIntOrFloatBitWidth() == 8) {
-            vals =
-                extractOperands(rawElems, kWidth, kBase, i8_ty, preserveBF16);
+            vals = extractOperands(rawElems, kWidth, kBase, i8_ty,
+                                   preserveBF16, isConstantScale);
           } else if (type.isBF16()) {
             vals = extractOperands(rawElems, kWidth, kBase, bf16_ty,
                                    preserveBF16);
@@ -506,6 +517,8 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
     Value b = op.getRhs();
     Value aScale = op.getLhsScale();
     Value bScale = op.getRhsScale();
+    bool isAScaleConstant = aScale.getDefiningOp<arith::ConstantOp>();
+    bool isBScaleConstant = bScale.getDefiningOp<arith::ConstantOp>();
     Value d = op.getD();
     auto aTensorTy = cast<RankedTensorType>(a.getType());
     auto bTensorTy = cast<RankedTensorType>(b.getType());
@@ -581,10 +594,12 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
     // operands.
     auto operandAScale = getValuesFromDotOperandLayoutStruct(
         loadedAScale, numRepB, numRepM, numRepK, scaleKWidth, scaleKBase,
-        aScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);
+        aScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false,
+        isAScaleConstant);
     auto operandBScale = getValuesFromDotOperandLayoutStruct(
         loadedBScale, numRepB, numRepN, numRepK, scaleKWidth, scaleKBase,
-        bScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);
+        bScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false,
+        isBScaleConstant);

     auto dstElemTy = dTensorTy.getElementType();
     auto fc = unpackLLElements(loc, loadedC, rewriter);

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 30 additions & 16 deletions
@@ -724,10 +724,6 @@ class ScaledBlockedToScaledMFMAF8F6F4 final
     TensorValue bScale = dotOp.getRhsScale();
     auto oldShape = oldRetType.getShape();

-    if (!aScale || !bScale)
-      return rewriter.notifyMatchFailure(dotOp,
-                                         "expect scales for both A and B");
-
     ScaleDotElemType aElemType = dotOp.getLhsType();
     ScaleDotElemType bElemType = dotOp.getRhsType();
     auto supportsTypes = [](ScaleDotElemType elemType) {
@@ -872,14 +868,25 @@ class ScaledBlockedToScaledMFMAF8F6F4 final

     auto convertScaleLayout = [&](TensorValue val, TensorValue scale,
                                   DotOperandEncodingAttr enc,
-                                  int idx) -> TensorValue {
-      auto dotLL = enc.toLinearLayout(val.getType().getShape());
+                                  int idx) -> Value {
+      auto valShape = val.getType().getShape();
+
+      auto dotLL = enc.toLinearLayout(valShape);
       LinearLayout::BasesT scaleBases = dotLL.getBases();
       auto &warpBases = scaleBases[kWarp];

       LinearLayout newLL = createLinearLayout(idx, warpBases);

-      auto shape = scale.getType().getShape();
+      SmallVector<int64_t> shape;
+      if (!scale) {
+        int64_t nonKDim = idx == 0 ? valShape[0] : valShape[1];
+        int64_t k = idx == 0 ? valShape[1] : valShape[0];
+        ScaleDotElemType &elemType = idx == 0 ? aElemType : bElemType;
+        int packSize = elemType == ScaleDotElemType::E2M1 ? 2 : 1;
+        shape = {nonKDim, k * packSize / 32};
+      } else {
+        shape = llvm::to_vector(scale.getType().getShape());
+      }

       // Adjust register-level layout to fill the shape, at this level, both
       // aScale and bScale should align with A operand.
@@ -891,18 +898,25 @@ class ScaledBlockedToScaledMFMAF8F6F4 final
       }
       newLL = newLL.transposeOuts(standardOutDims);
       Attribute newScaleEncoding = ttg::LinearEncodingAttr::get(ctx, newLL);
-
-      auto newScaleType = RankedTensorType::get(
-          shape, scale.getType().getElementType(), newScaleEncoding);
-      return rewriter.create<ttg::ConvertLayoutOp>(scale.getLoc(), newScaleType,
-                                                   scale);
+      // Scale's data type is always i8
+      auto newScaleType = RankedTensorType::get(shape, i8_ty, newScaleEncoding);
+
+      if (!scale) {
+        // 0x7F is 1.0 in E8M0
+        return rewriter.create<arith::ConstantOp>(
+            dotOp->getLoc(), newScaleType,
+            DenseElementsAttr::get(newScaleType, llvm::APInt(8, 0x7F)));
+      } else {
+        return rewriter.create<ttg::ConvertLayoutOp>(scale.getLoc(),
+                                                     newScaleType, scale);
+      }
     };
-    aScale = convertScaleLayout(a, aScale, newAEncoding, 0);
-    bScale = convertScaleLayout(b, bScale, newBEncoding, 1);
+    auto newAScale = convertScaleLayout(a, aScale, newAEncoding, 0);
+    auto newBScale = convertScaleLayout(b, bScale, newBEncoding, 1);

     auto newDot = rewriter.create<triton::DotScaledOp>(
-        dotOp.getLoc(), newRetType, a, b, newAcc, aScale, bScale, aElemType,
-        bElemType, dotOp.getFastMath());
+        dotOp.getLoc(), newRetType, a, b, newAcc, newAScale, newBScale,
+        aElemType, bElemType, dotOp.getFastMath());

     rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(dotOp, oldRetType,
                                                       newDot);