
Commit 5700c14

[FRONTEND] Fix and improve minimum dot size checks (#5383)

1. Fix the problem that [m, k, n] rather than [m, n, k] was returned on the NVIDIA backend.
2. Check both int8 and float8.
3. Add a new compiler error test.
4. Fix the dtype check in the AMD backend.

1 parent e3d3851 · commit 5700c14
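For context on item 1: the tuple a backend's `min_dot_size` hook returns is consumed as minimum sizes in [m, n, k] order. The old NVIDIA lambda returned (16, 32, 16) for int8, i.e. [m, k, n], putting the 8-bit K minimum of 32 in the N slot. A minimal sketch of the before/after behavior, with a hypothetical `Int8Operand` stand-in for Triton's real operand types:

```python
# Hypothetical stand-in for a Triton operand type: the real code passes block
# types whose element type is reachable via `.scalar`.
class Int8Operand:
    class scalar:
        primitive_bitwidth = 8

def old_nvidia_check(lhs, rhs):
    # Pre-fix behavior: (16, 32, 16) reads as [m, k, n] -- the K minimum of 32
    # landed in the N slot.
    return (16, 32, 16)

def new_nvidia_check(lhs, rhs):
    # Post-fix behavior: 8-bit operands need K >= 32, in the correct [m, n, k] slot.
    return (16, 16, 32) if lhs.scalar.primitive_bitwidth == 8 else (16, 16, 16)

m_min, n_min, k_min = new_nvidia_check(Int8Operand(), Int8Operand())
assert (m_min, n_min, k_min) == (16, 16, 32)
```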

File tree

4 files changed (+51 -3 lines)

python/test/unit/language/test_compile_errors.py
Lines changed: 37 additions & 1 deletion

@@ -7,7 +7,7 @@
 import triton.language as tl
 from triton.compiler.errors import CompilationError, CompileTimeAssertionFailure
 import traceback
-from triton._internal_testing import is_interpreter, is_cuda, is_hip, is_hip_mi300
+from triton._internal_testing import is_interpreter, is_cuda, is_hip, is_hip_mi300, is_hip_mi200


 def test_err_undefined_variable():

@@ -379,6 +379,42 @@ def dtype_kernel(dtype: tl.constexpr):
         raise assertion_err from e.value


+@pytest.mark.parametrize("dtype", [tl.float8e5, tl.int8, tl.float16])
+def test_min_dot_size(dtype):
+    error_msg = "Input shapes should have "
+    if is_cuda():
+        if dtype.primitive_bitwidth == 8:
+            error_msg += "M >= 16, N >= 16 and K >= 32"
+        else:
+            error_msg += "M >= 16, N >= 16 and K >= 16"
+    elif is_hip_mi300():
+        if dtype.is_int8():
+            error_msg += "M >= 16, N >= 16 and K >= 16"
+        else:
+            error_msg += "M >= 16, N >= 16 and K >= 8"
+    elif is_hip_mi200():
+        error_msg += "M >= 16, N >= 16 and K >= 8"
+    elif is_hip():
+        error_msg += "M >= 16, N >= 16 and K >= 16"
+    else:
+        pytest.skip("Test only supported on CUDA and HIP")
+
+    @triton.jit
+    def dot_kernel(dtype: tl.constexpr):
+        SIZE: tl.constexpr = 8
+        a = tl.full((SIZE, SIZE), 0.0, dtype)
+        b = tl.full((SIZE, SIZE), 0.0, dtype)
+        tl.dot(a, b)
+
+    with pytest.raises(CompilationError) as e:
+        triton.compile(
+            triton.compiler.ASTSource(fn=dot_kernel, signature={"dtype": "constexpr"}, constexprs={"dtype": dtype}))
+    try:
+        assert (error_msg in str(e.value.__cause__))
+    except AssertionError as assertion_err:
+        raise assertion_err from e.value
+
+
 def test_max_num_imprecise_acc_limit():

     @triton.jit
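To run just the new test from a repo checkout (a CUDA or HIP device is needed; on other targets it self-skips), something like the following should work:

```python
import pytest

# Select only the new minimum-dot-size test by name; path is relative to the repo root.
pytest.main(["python/test/unit/language/test_compile_errors.py", "-k", "test_min_dot_size", "-v"])
```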

python/triton/language/semantic.py
Lines changed: 1 addition & 0 deletions

@@ -1473,6 +1473,7 @@ def dot(lhs: tl.tensor, rhs: tl.tensor, acc: tl.tensor, input_precision: Optiona
     assert lhs.dtype == rhs.dtype, f"Both operands must be same dtype. Got {lhs.dtype} and {rhs.dtype}"

     if lhs.dtype.is_fp8e4b15() or rhs.dtype.is_fp8e4b15():
+        # We upcast because there's no fp8e4b15 type in MLIR
         lhs = cast(lhs, tl.float16, builder)
         rhs = cast(rhs, tl.float16, builder)
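The added comment documents behavior that was already in place: fp8e4b15 has no MLIR counterpart, so both `tl.dot` operands are upcast to float16 before lowering. A rough standalone sketch of that rule (the function and string dtype names are illustrative, not Triton's API):

```python
def upcast_for_dot(lhs_dtype: str, rhs_dtype: str):
    # fp8e4b15 cannot be represented in MLIR, so the frontend casts both
    # dot operands to float16 before emitting IR.
    if "fp8e4b15" in (lhs_dtype, rhs_dtype):
        return ("float16", "float16")
    return (lhs_dtype, rhs_dtype)

assert upcast_for_dot("fp8e4b15", "fp8e4b15") == ("float16", "float16")
assert upcast_for_dot("float32", "float32") == ("float32", "float32")
```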

third_party/amd/backend/compiler.py
Lines changed: 2 additions & 1 deletion

@@ -18,7 +18,8 @@ def min_dot_size(target: GPUTarget):
     # CDNA 3.0 supports k==8 in all mfma variants except for int8
     # (where the smallest `k` supported is 16)
     if "gfx94" in arch_str:
-        return lambda lhsType, rhsType: (16, 16, 16) if (lhsType.is_int8() or rhsType.is_int8()) else (16, 16, 8)
+        return lambda lhsType, rhsType: (16, 16, 16) if (lhsType.scalar.is_int8() or rhsType.scalar.is_int8()) else (
+            16, 16, 8)
     # CDNA 2.0 always supports `k==8`
     if "gfx9" in arch_str:
         return lambda lhsType, rhsType: (16, 16, 8)
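The `.scalar` hop is the dtype fix from item 4: at this call site the operand types are block (tensor) types, so `is_int8()` has to be asked of the element type. A small sketch with hypothetical stand-in classes:

```python
class ScalarType:
    def __init__(self, name: str):
        self.name = name

    def is_int8(self) -> bool:
        return self.name == "int8"

class BlockType:
    # Tensor-of-scalars type; exposes its element type as `.scalar`.
    def __init__(self, scalar: ScalarType):
        self.scalar = scalar

lhs = BlockType(ScalarType("int8"))
rhs = BlockType(ScalarType("float16"))
# BlockType itself has no is_int8(); the element type carries that predicate.
min_mnk = (16, 16, 16) if (lhs.scalar.is_int8() or rhs.scalar.is_int8()) else (16, 16, 8)
assert min_mnk == (16, 16, 16)  # int8 mfma on CDNA 3.0 (gfx94*) needs K >= 16
```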

third_party/nvidia/backend/compiler.py
Lines changed: 11 additions & 1 deletion

@@ -17,7 +17,17 @@


 def min_dot_size(target: GPUTarget):
-    return lambda lhsType, rhsType: (16, 32, 16) if lhsType.is_int8() else (16, 16, 16)
+
+    def check_dot_compatibility(lhs_type, rhs_type) -> Tuple[int, int, int]:  # [m, n, k]
+        lhs_bitwidth = lhs_type.scalar.primitive_bitwidth
+        rhs_bitwidth = rhs_type.scalar.primitive_bitwidth
+        assert lhs_bitwidth == rhs_bitwidth, "lhs and rhs bitwidth must be the same"
+        if lhs_bitwidth == 8:
+            return (16, 16, 32)
+        else:
+            return (16, 16, 16)
+
+    return check_dot_compatibility


 @functools.lru_cache()
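Note that the new checker keys on `primitive_bitwidth` rather than `is_int8()`, which is what makes item 2 work: float8 inputs now get the same K >= 32 minimum as int8. A quick illustration with a hypothetical float8 stand-in type:

```python
# Hypothetical stand-in for a float8 operand (e.g. tl.float8e5 also has bitwidth 8).
class Float8Operand:
    class scalar:
        primitive_bitwidth = 8

def check_dot_compatibility(lhs_type, rhs_type):  # mirrors the fixed hook; [m, n, k]
    assert lhs_type.scalar.primitive_bitwidth == rhs_type.scalar.primitive_bitwidth
    return (16, 16, 32) if lhs_type.scalar.primitive_bitwidth == 8 else (16, 16, 16)

assert check_dot_compatibility(Float8Operand(), Float8Operand()) == (16, 16, 32)
```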
