
Commit de4376e

[triton_kernels][matmul] support inputs with 0 elements (#7808)
1 parent c850534

File tree: 3 files changed (+19 −3 lines)

python/triton_kernels/tests/test_matmul.py

Lines changed: 9 additions & 2 deletions
@@ -161,6 +161,13 @@ class Case:
     ", ".join(f.name for f in fields(Case)),
     [
         tuple(getattr(case, f.name) for f in fields(Case)) for case in [
+            # Zero-sized args:
+            Case(0, 5, 7, "ragged", "float16", "float16"),
+            Case(5, 0, 7, "ragged", "float16", "float16"),
+            Case(5, 7, 0, "ragged", "float16", "float16"),
+            Case(0, 5, 7, "batched", "float16", "float16"),
+            Case(5, 0, 7, "batched", "float16", "float16"),
+            Case(5, 7, 0, "batched", "float16", "float16"),
             # Non-mx types:
             Case(16, 256, 256, "ragged", "float16", "float16", 128, 4),
             Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, n_expt_shards=2),
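
The new cases exercise each of m, n, and k being zero, in both ragged and batched modes. As a point of reference for the semantics involved, here is a minimal sketch of zero-sized matmul behavior in plain PyTorch (an illustration only, not the triton_kernels code path):

import torch

# Contracting over k == 0 yields an all-zero (m, n) result;
# m == 0 or n == 0 yields an empty output of the right shape.
x = torch.randn(5, 0)  # m=5, k=0
w = torch.randn(0, 7)  # k=0, n=7
y = x @ w
assert y.shape == (5, 7) and (y == 0).all()
assert (torch.randn(0, 4) @ torch.randn(4, 7)).shape == (0, 7)  # m == 0
assert (torch.randn(5, 4) @ torch.randn(4, 0)).shape == (5, 0)  # n == 0
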
@@ -301,7 +308,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
         pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")

     # launch metadata for batched / mx types may not work yet.
-    test_launch_metadata = (mode == "ragged") and ("mx" not in weight_dtype_str) and fused_scatter
+    test_launch_metadata = (mode == "ragged") and ("mx" not in weight_dtype_str) and fused_scatter and m*n*k != 0

     torch.manual_seed(0)

@@ -349,7 +356,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
                                  has_y_gammas, requires_grad=test_bwd, device=device)
     x_ref, w_ref, bias_ref, gs0_ref, gs1_ref = apply_precision(x_tri, w_tri, bias_tri, gs0_tri, gs1_tri, precision_opt)

-    if w_tri.shape[0] == 1:
+    if w_tri.shape[0] == 1 and mode != "batched":
         # Test the case when weight has dim 2, i.e., shape (K, N).
         w_tri = w_tri.squeeze(0).detach().requires_grad_(test_bwd)
         w_ref = w_ref.squeeze(0).detach().requires_grad_(test_bwd)
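
The extra mode != "batched" check keeps the weight 3-D whenever the mode is batched: there the leading dimension is a real batch dimension even when it equals 1, and squeezing it would change the operation under test. An illustration in plain PyTorch, with torch.bmm standing in for the batched path:

import torch

# A batch of size 1 must stay 3-D for a batched matmul; squeezing the
# leading dim would turn w into a single 2-D weight and break bmm.
x = torch.randn(1, 3, 4)  # (batch=1, M, K)
w = torch.randn(1, 4, 8)  # (batch=1, K, N)
y = torch.bmm(x, w)       # bmm requires both operands to be 3-D
assert y.shape == (1, 3, 8)
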

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 7 additions & 1 deletion
@@ -6,6 +6,7 @@
 import torch
 import triton
 from enum import Enum, auto
+import math
 # utilities
 from triton_kernels import target_info
 from triton_kernels.numerics import InFlexData, OutFlexData
@@ -458,6 +459,11 @@ def matmul_ogs(x, w, bias,
         opt_flags, preprocessing_features, postprocessing_features
     )
     memory = apply_allocation(allocation, y)
+    if batch_size * M * N == 0:
+        ret = memory["output"].squeeze(0)
+        if not is_input_batched:
+            ret = ret.squeeze(0)
+        return ret
     # TMA descriptors require a global memory allocation
     if opt_flags.is_persistent:
         triton.set_allocator(get_per_device_per_stream_alloc_fn(x.device))
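
When batch_size * M * N is zero there is nothing to compute, so the function hands back the freshly allocated (empty) output instead of launching a kernel. A minimal sketch of the pattern with simplified names (a hypothetical helper, not the actual matmul_ogs signature):

import torch

def maybe_return_empty(output: torch.Tensor, batch_size: int, M: int, N: int,
                       is_input_batched: bool):
    # Mirrors the diff: squeeze the leading dim of the allocated buffer,
    # and the batch dim too when the caller passed unbatched inputs.
    if batch_size * M * N == 0:
        ret = output.squeeze(0)
        if not is_input_batched:
            ret = ret.squeeze(0)
        return ret
    return None  # non-empty output: proceed to the kernel launch
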
@@ -509,7 +515,7 @@ def matmul_ogs(x, w, bias,
     has_scatter = writeback_idxs is not None
     has_gather_tma = has_gather and target_info.has_tma_gather()
     has_scatter_tma = has_scatter and target_info.has_tma_gather()
-    y = wrap_torch_tensor(out0.view(-1, out0.shape[-1]) if has_scatter else out0.view(-1, *out0.shape[-2:]))
+    y = wrap_torch_tensor(out0.view(math.prod(out0.shape[:-1]), out0.shape[-1]) if has_scatter else out0.view(math.prod(out0.shape[:-2]), *out0.shape[-2:]))
     x_storage = _canonicalize_storage(x.storage, 2 if has_gather_tma else 3, flex.lhs_data)
     w_storage = _canonicalize_storage(w.storage, 3, flex.rhs_data)
     y_storage = _canonicalize_storage(y.storage, 2 if has_scatter_tma else 3, flex.out_data)
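
The view(-1, ...) to view(math.prod(...), ...) change avoids PyTorch's size inference, which fails on zero-element tensors whenever the explicitly given dimensions multiply to zero: any value for -1 would satisfy numel() == 0, so PyTorch refuses to guess. Computing the leading size explicitly sidesteps this. A small demonstration, assuming nothing beyond stock PyTorch:

import math
import torch

t = torch.empty(2, 5, 0)  # zero elements, trailing dim of size 0
try:
    t.view(-1, 0)         # -1 is ambiguous when the remaining dims give 0
except RuntimeError as e:
    print(e)              # "cannot reshape tensor of 0 elements ..."
print(t.view(math.prod(t.shape[:-1]), 0).shape)  # torch.Size([10, 0])
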

python/triton_kernels/triton_kernels/testing.py

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ def assert_close(ref, tri, maxtol=None, rmstol=None, description="--", verbose=T
         return
     ref = ref_as_type

+    if ref.numel() == 0:
+        return
+
     if maxtol is None:
         maxtol = 2e-2
     if rmstol is None:
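
With zero elements there is nothing to compare, so assert_close now returns success before computing tolerances; stock torch.testing treats same-shaped empty tensors the same way:

import torch

# Empty tensors of matching shape compare as trivially close;
# this raises nothing.
torch.testing.assert_close(torch.empty(0, 7), torch.empty(0, 7))
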
