Commit c186592

[FRONTEND] Add scales dimension checks for dot_scaled (#8564)
1 parent 3f5eb50 commit c186592

7 files changed: +101 -8 lines changed

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 0 deletions
@@ -732,6 +732,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     `lhs` `=` $a_elem_type `rhs` `=` $b_elem_type attr-dict
     `:` type($a) (`,` type($a_scale)^)? `*` type($b) (`,` type($b_scale)^)? `->` type($d)
   }];
+  let hasVerifier = 1;
 }
 
 //

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 38 additions & 0 deletions
@@ -356,6 +356,44 @@ bool DotScaledOp::verifyOutputDims() {
   return true;
 }
 
+LogicalResult DotScaledOp::verify() {
+  auto aShape = this->getA().getType().getShape();
+  int64_t rank = aShape.size();
+
+  auto k = aShape[rank - 1];
+  if (this->getAElemType() == ScaleDotElemType::E2M1) {
+    if (this->getLhsKPack())
+      k *= 2;
+  }
+  auto cShape = this->getC().getType().getShape();
+  int64_t mDim = cShape[cShape.size() - 2];
+  int64_t nDim = cShape[cShape.size() - 1];
+
+  if (getAScale()) {
+    auto aScaleShape = getAScale().getType().getShape();
+    if (aScaleShape[rank - 2] != mDim)
+      return this->emitError(
+          "scales M dimension must match the operand M dimension");
+    int scale_factor =
+        isa<Float8E4M3FNType>(getAScale().getType().getElementType()) ? 16 : 32;
+    if (aScaleShape[rank - 1] != k / scale_factor)
+      return this->emitError("scales K dimension must match the operand K "
+                             "divided by the scale factor");
+  }
+  if (getBScale()) {
+    auto bScaleShape = getBScale().getType().getShape();
+    if (bScaleShape[rank - 2] != nDim)
+      return this->emitError(
+          "scales N dimension must match the operand N dimension");
+    int scale_factor =
+        isa<Float8E4M3FNType>(getBScale().getType().getElementType()) ? 16 : 32;
+    if (bScaleShape[rank - 1] != k / scale_factor)
+      return this->emitError("scales K dimension must match the operand K "
+                             "divided by the scale factor");
+  }
+  return success();
+}
+
 //-- MakeRangeOp --
 OpFoldResult MakeRangeOp::fold(FoldAdaptor adaptor) {
   // make_range(start, start + 1) -> constant(start)
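
The rule the new verifier enforces: each scale tensor's leading dimension must equal the corresponding M or N dimension of the accumulator, and its trailing dimension must equal K divided by the scale factor (16 when the scale element type is fp8e4m3, 32 otherwise), where K is doubled for e2m1 operands packed along K. A minimal standalone Python sketch of that arithmetic, not part of the commit and with illustrative names only:

# Hypothetical helper mirroring DotScaledOp::verify(); names are not from the Triton codebase.
def expected_scale_shape(m_or_n, k, scale_is_fp8e4m3, elem_is_e2m1=False, k_pack=False):
    if elem_is_e2m1 and k_pack:
        k *= 2  # e2m1 packs two 4-bit values per byte along K
    scale_factor = 16 if scale_is_fp8e4m3 else 32
    return [m_or_n, k // scale_factor]

# Matches the MLIR test updates further below: K = 32 with i8 scales -> 32 // 32 == 1.
assert expected_scale_shape(128, 32, scale_is_fp8e4m3=False) == [128, 1]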

python/test/unit/language/test_compile_errors.py

Lines changed: 20 additions & 0 deletions
@@ -489,3 +489,23 @@ def kernel(N: tl.constexpr):
 
     with pytest.raises(CompilationError, match="N marked as constexpr and listed in do_not_specialize"):
         kernel[(1, )](5)
+
+
+def test_dot_scaled_shape_verification(fresh_triton_cache):
+
+    @triton.jit
+    def kernel():
+        M: tl.constexpr = 32
+        K: tl.constexpr = 64
+        N: tl.constexpr = 32
+        a = tl.full((M, K), 0, tl.uint8)
+        b = tl.full((K, N), 0, tl.uint8)
+        lhs_scale_wrong = tl.full((M, 4), 0, tl.uint8)
+        rhs_scale = tl.full((N, 2), 0, tl.uint8)
+        acc = tl.full((M, N), 0.0, tl.float32)
+        tl.dot_scaled(a, lhs_scale_wrong, "e5m2", b, rhs_scale, "e5m2", acc, False, True, True, tl.float32)
+
+    with pytest.raises(CompilationError) as e:
+        triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
+
+    assert str(e.value.__cause__) == "lhs_scale must be a tensor of shape [32, 2]. Got ['32', '4']"
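
For contrast, a passing counterpart to the kernel above sizes lhs_scale as (M, K // 32) = (32, 2), the shape named in the error message. This sketch mirrors the positional tl.dot_scaled call from the test and is not part of the commit:

import triton
import triton.language as tl


# Hypothetical fixed variant of the failing test kernel; only lhs_scale's shape changes.
@triton.jit
def kernel_ok():
    M: tl.constexpr = 32
    K: tl.constexpr = 64
    N: tl.constexpr = 32
    a = tl.full((M, K), 0, tl.uint8)
    b = tl.full((K, N), 0, tl.uint8)
    lhs_scale = tl.full((M, 2), 0, tl.uint8)  # K // 32 == 2 for uint8 (non-fp8e4nv) scales
    rhs_scale = tl.full((N, 2), 0, tl.uint8)
    acc = tl.full((M, N), 0.0, tl.float32)
    tl.dot_scaled(a, lhs_scale, "e5m2", b, rhs_scale, "e5m2", acc, False, True, True, tl.float32)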

python/triton/language/semantic.py

Lines changed: 19 additions & 0 deletions
@@ -1590,6 +1590,20 @@ def _bitcast_to_fp_type(self, val: TensorTy, float_format: str):
         assert val.dtype == unsigned_ty, f"Unexpected dtype for {float_format}. Got {val.dtype}"
         return self.bitcast(val, triton_ty)
 
+    def verify_scaled_shape(self, M, N, K, lhs_scale, rhs_scale):
+        if lhs_scale is not None:
+            scale_factor = 16 if lhs_scale.dtype.is_fp8e4nv() else 32
+            lhs_scale_shape = lhs_scale.type.shape
+            assert lhs_scale_shape == [
+                M, K // scale_factor
+            ], f"lhs_scale must be a tensor of shape [{M}, {K // scale_factor}]. Got {lhs_scale_shape}"
+        if rhs_scale is not None:
+            scale_factor = 16 if rhs_scale.dtype.is_fp8e4nv() else 32
+            rhs_scale_shape = rhs_scale.type.shape
+            assert rhs_scale_shape == [
+                N, K // scale_factor
+            ], f"rhs_scale must be a tensor of shape [{N}, {K // scale_factor}]. Got {rhs_scale_shape}"
+
     def dot_scaled(self, lhs: TensorTy, lhs_scale: TensorTy, lhs_format: str, rhs: TensorTy,
                    rhs_scale: Optional[TensorTy], rhs_format: str, acc: TensorTy | None, fast_math: bool,
                    lhs_k_pack: bool, rhs_k_pack: bool, out_dtype: tl.dtype) -> TensorTy:
@@ -1621,8 +1635,11 @@ def dot_scaled(self, lhs: TensorTy, lhs_scale: TensorTy, lhs_format: str, rhs: T
         assert PACKED_B_DIM == PACKED_A_DIM, f"Reduction dimension should pack the same number of elements; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
         #assert K * PACKED_B >= 64, f"scaled_dot NYI for K < 64. Got {K=}"
         B = lhs.type.shape[0] if lhs_rank == 3 else None
+        K = K_LHS
         if not lhs_k_pack:
             M = M * PACKED_A
+        else:
+            K = K * PACKED_A
         if not rhs_k_pack:
             N = N * PACKED_B
         ret_ty = tl.block_type(out_dtype, [B, M, N] if B else [M, N])
@@ -1634,6 +1651,8 @@ def dot_scaled(self, lhs: TensorTy, lhs_scale: TensorTy, lhs_format: str, rhs: T
         assert acc.type.shape == ret_ty.shape and acc.type.element_ty == out_dtype
         rhs_scale_handle = None if rhs_scale_is_none else rhs_scale.handle
         lhs_scale_handle = None if lhs_scale_is_none else lhs_scale.handle
+        self.verify_scaled_shape(M, N, K, None if lhs_scale_is_none else lhs_scale,
+                                 None if rhs_scale_is_none else rhs_scale)
         return self.tensor(
             self.builder.create_dot_scaled(lhs.handle, lhs_scale_handle, lhs_format_enum, rhs.handle, rhs_scale_handle,
                                            rhs_format_enum, fast_math, lhs_k_pack, rhs_k_pack, acc_handle), ret_ty)
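
To make the K bookkeeping in the hunks above concrete: when the lhs format is e2m1 and lhs_k_pack is true, the stored K counts bytes and is half of the logical K, so the scale check divides the doubled K by the scale factor. An illustrative arithmetic sketch, not code from semantic.py:

# Assumed example: a 128x32 uint8 storage of an e2m1 (fp4) operand, packed along K.
stored_lhs_shape = (128, 32)
PACKED_A = 2                                  # two fp4 values per uint8 when lhs_k_pack=True
logical_K = stored_lhs_shape[1] * PACKED_A    # 64
scale_factor = 32                             # uint8 scales; 16 only when scales are fp8e4nv
expected_lhs_scale_shape = [stored_lhs_shape[0], logical_K // scale_factor]
assert expected_lhs_scale_shape == [128, 2]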

test/Conversion/tritongpu_to_llvm_sm120.mlir

Lines changed: 4 additions & 4 deletions
@@ -9,17 +9,17 @@ module attributes {"ttg.target" = "cuda:120", "ttg.num-ctas" = 1 : i32, "ttg.num
 // CHECK: mma.sync.aligned.m16n8k32.row.col.kind::mxf8f6f4.block_scale.scale_vec::1X
 tt.func public @sm120_mmav2_dot_scaled(
   %a: tensor<128x32xf8E5M2, #blocked_k>,
-  %sa: tensor<128x2xi8, #blocked>,
+  %sa: tensor<128x1xi8, #blocked>,
   %b: tensor<32x128xf8E5M2, #blocked>,
-  %sb: tensor<128x2xi8, #blocked>,
+  %sb: tensor<128x1xi8, #blocked>,
   %out: !tt.ptr<f32>
 ){
   %c = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
   %a_d = ttg.convert_layout %a : tensor<128x32xf8E5M2, #blocked_k> -> tensor<128x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
   %b_d = ttg.convert_layout %b : tensor<32x128xf8E5M2, #blocked> -> tensor<32x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
   %d = tt.dot_scaled %a_d scale %sa, %b_d scale %sb, %c lhs = e5m2 rhs = e5m2 {fastMath = false}
-    : tensor<128x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<128x2xi8, #blocked>
-    * tensor<32x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, tensor<128x2xi8, #blocked>
+    : tensor<128x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<128x1xi8, #blocked>
+    * tensor<32x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, tensor<128x1xi8, #blocked>
     -> tensor<128x128xf32, #blocked>
   %out_splat = tt.splat %out : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked>
   %out_ptrs = tt.broadcast %out_splat : tensor<128x1x!tt.ptr<f32>, #blocked> -> tensor<128x128x!tt.ptr<f32>, #blocked>

test/Triton/invalid.mlir

Lines changed: 15 additions & 0 deletions
@@ -541,6 +541,21 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32}
 
 // -----
 
+module {
+  tt.func @dot_scaled_invalid_dims(
+    %a: tensor<128x128xf8E4M3FN>,
+    %b: tensor<128x128xf8E4M3FN>,
+    %a_scale: tensor<128x128xi8>,
+    %b_scale: tensor<128x4xi8>) -> tensor<128x128xf32> {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
+    // expected-error @below {{scales K dimension must match the operand K divided by the scale factor}}
+    %result = tt.dot_scaled %a scale %a_scale, %b scale %b_scale, %cst lhs = e4m3 rhs = e4m3 {fastMath = true} : tensor<128x128xf8E4M3FN>, tensor<128x128xi8> * tensor<128x128xf8E4M3FN>, tensor<128x4xi8>-> tensor<128x128xf32>
+    tt.return %result : tensor<128x128xf32>
+  }
+}
+
+// -----
+
 tt.func @unsplat_invalid(%arg0: tensor<128xf32>) {
   // expected-error @below {{source tensor must have exactly one element}}
   %0 = tt.unsplat %arg0 : tensor<128xf32>

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 4 additions & 4 deletions
@@ -680,18 +680,18 @@ module attributes {"ttg.target" = "cuda:120", "ttg.num-ctas" = 1 : i32, "ttg.num
 // CHECK-LABEL: @sm120_dot_scaled_basic
 tt.func public @sm120_dot_scaled_basic(
   %a: tensor<128x32xi8, #blocked_k>,
-  %scale_a: tensor<128x2xi8, #blocked>,
+  %scale_a: tensor<128x1xi8, #blocked>,
   %b: tensor<32x128xi8, #blocked>,
-  %scale_b: tensor<128x2xi8, #blocked>
+  %scale_b: tensor<128x1xi8, #blocked>
 ) -> tensor<128x128xf32, #blocked> {
   %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
   // CHECK-DAG: tt.dot_scaled
   // CHECK-DAG: #linear
   // CHECK-DAG: #linear1
   // CHECK-NOT: ttng.tc_gen5_mma_scaled
   %d = tt.dot_scaled %a scale %scale_a, %b scale %scale_b, %cst lhs = e4m3 rhs = e4m3 {fastMath = false}
-    : tensor<128x32xi8, #blocked_k>, tensor<128x2xi8, #blocked>
-    * tensor<32x128xi8, #blocked>, tensor<128x2xi8, #blocked>
+    : tensor<128x32xi8, #blocked_k>, tensor<128x1xi8, #blocked>
+    * tensor<32x128xi8, #blocked>, tensor<128x1xi8, #blocked>
     -> tensor<128x128xf32, #blocked>
   tt.return %d : tensor<128x128xf32, #blocked>
 }
