@@ -3838,6 +3838,13 @@ def get_test_dot_vdot2_cases():
             (4, 32, 32, 4, False, False, 'None', 'ieee', 'bfloat16', 'float32', 1, None)]
 
 
+def get_test_small_dots_cases():
+    if not is_cuda():
+        return []
+    return [(2, 4, 32, 1, False, False, 'None', 'ieee', 'float16', 'float32', 1, None),
+            (1, 2, 32, 1, False, False, 'None', 'ieee', 'float8e5', 'float32', 1, None)]
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize(
     "M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dtype, out_dtype, kpack, mma_nonk_size",
@@ -3851,15 +3858,16 @@ def get_test_dot_vdot2_cases():
     get_test_dot_fp8_output_cases() + \
     get_test_dot_small_k_mfma_cases() + \
     get_test_dot_small_mn_fma_cases() + \
-    get_test_dot_softmax())
+    get_test_dot_softmax() + \
+    get_test_small_dots_cases())
 @pytest.mark.parametrize("num_ctas", num_ctas_list)
 def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dtype, out_dtype, kpack, mma_nonk_size,
              num_ctas, device):
     if is_interpreter():
         if in_dtype == 'bfloat16':
             pytest.skip("bfloat16 is not supported in the interpreter")
     else:
-        if not is_hip() and (M < 16 or N < 16 or K < 16):
+        if not is_hip() and K < 16:
             pytest.skip("small dots are supported only on HIP at the moment")
         if is_cuda():
             capability = torch.cuda.get_device_capability()
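
Aside (not part of the commit): a minimal, self-contained sketch of the kind of sub-16 M/N dot the new `get_test_small_dots_cases` entries exercise: M=2, N=4, K=32 in float16 with a float32 accumulator. The kernel below is a hypothetical illustration, not code from this diff, and assumes a CUDA device plus a Triton build that accepts dot shapes this small (the very feature being tested here).

```python
import torch
import triton
import triton.language as tl


@triton.jit
def small_dot_kernel(x_ptr, y_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    # The whole (tiny) operands fit in one block; offsets assume row-major layout.
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    x = tl.load(x_ptr + offs_m[:, None] * K + offs_k[None, :])  # (M, K) fp16
    y = tl.load(y_ptr + offs_k[:, None] * N + offs_n[None, :])  # (K, N) fp16
    z = tl.dot(x, y, out_dtype=tl.float32)                      # (M, N) fp32 accumulator
    tl.store(z_ptr + offs_m[:, None] * N + offs_n[None, :], z)


M, N, K = 2, 4, 32  # mirrors the first new test case
x = torch.randn((M, K), dtype=torch.float16, device='cuda')
y = torch.randn((K, N), dtype=torch.float16, device='cuda')
z = torch.empty((M, N), dtype=torch.float32, device='cuda')
small_dot_kernel[(1, )](x, y, z, M=M, N=N, K=K)
torch.testing.assert_close(z, x.float() @ y.float(), atol=1e-3, rtol=1e-3)
```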
@@ -4097,10 +4105,12 @@ def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, strid
             assert 'wgmma.mma_async.sync.aligned' in ptx or \
                 'mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32' in ptx
         elif in_dtype == "float8e5" and out_dtype == tl.float32:
-            if capability[0] == 9:
+            if capability[0] == 9 and M >= 64 and N >= 8:
                 assert 'wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2' in ptx
+            elif capability[0] >= 8 and M < 64:
+                assert 'mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32' in ptx
         elif in_dtype == "float8e4nv" and out_dtype == tl.float32:
-            if capability[0] == 9:
+            if capability[0] == 9 and M >= 64 and N >= 8:
                 assert 'wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3' in ptx
         if is_tcgen5 and epilogue == 'softmax' and M >= 128:
             # check that there is no shared memory exchange in the softmax
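
Aside (not part of the commit): the `ptx` string checked in the last hunk comes from Triton's launch handle, which this test file reads as `pgm.asm['ptx']`. A hedged sketch of reproducing the new small-M assertion interactively, reusing `small_dot_kernel` and its arguments from the sketch above and assuming an sm_80+ device where the small dot lowers to `mma` rather than `wgmma`:

```python
# The launch returns a handle whose .asm dict holds the generated assembly.
pgm = small_dot_kernel[(1, )](x, y, z, M=M, N=N, K=K)
ptx = pgm.asm['ptx']
# For M < 64 the diff expects the fp16 m16n8k16 MMA form; checking the
# generic prefix here keeps the sketch robust across PTX variants.
assert 'mma.sync.aligned' in ptx
```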