[KERNELS] Fix launch metadata computations for matmul_ogs. (#8429)

yongjik · web-flow · commit 0766464fc963 · 2025-10-11T14:39:20.000-07:00
(The previous code triggered CUDA or Python assertion failures in some cases.)
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs.py
@@ -467,7 +467,10 @@ def matmul_ogs(x, w, bias,
         assert routing_data is None
         assert gather_indx is None
         assert scatter_indx is None
-        routing_data = RoutingData(None, None, inner_routing_data.base.n_expts_tot, 1)
+        routing_data = RoutingData(
+            None, None, inner_routing_data.base.n_expts_tot, 1,
+            expected_tokens_per_expt=inner_routing_data.base.expected_tokens_per_expt,
+        )
     # canonicalize inputs
     if precision_config is None:
         precision_config = PrecisionConfig()
@@ -684,6 +687,7 @@ def matmul_ogs(x, w, bias,
                    N, K, K_W,
                    betas, gammas,
                    None if gather_indx is None else gather_indx.src_indx,
+                   None if gather_indx is None else gather_indx.dst_indx,  # Only for launch_metadata
                    None if scatter_indx is None else scatter_indx.src_indx,
                    num_indx,
                    None if not opt_flags.fused_scatter else scatter_indx.dst_indx,
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py
@@ -1,4 +1,3 @@
-import torch
 import triton
 import triton.language as tl
 
@@ -221,7 +220,7 @@ def matmul_launch_metadata(grid, kernel, args):
         n_tokens = None
         n_w_bytes = W.numel() * W.element_size()
     if expt_is_inner:
-        K = int(n_tokens)
+        K = None if n_tokens is None else int(n_tokens)
     repr = lambda s, x: f"{s} = {x}" if x is not None else f"E_{len(hist)}({s}) = {n_rows}"
     nbits = X.dtype.itemsize * 8
     batch_repr = ""
@@ -238,20 +237,15 @@ def matmul_launch_metadata(grid, kernel, args):
     fM = M if M is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * K * (1 if expt_is_inner else batch_size)
 
-    gindx = args.get("GatherIndx", None)
+    dst = args.get("GatherDstIndx", None)
     # sindx = args.get("WriteBackIndx", None)
     n_x_bytes = X.numel() * X.element_size()
     n_y_bytes = Y.numel() * Y.element_size()
     if hist is not None:
         assert n_tokens is not None
         n_expts_act = args["N_EXPTS_ACT"]
 
-        if (gindx is not None) and launch_metadata_allow_sync():
-            # recreate inverse GatherIndx.
-            dst = torch.full_like(gindx, -1)
-            idx = torch.arange(len(gindx), device=gindx.device, dtype=torch.int32)
-            mask = gindx != -1
-            dst[gindx[mask]] = idx[mask]
+        if (dst is not None) and launch_metadata_allow_sync():
             n_read_rows = (dst.view((-1, n_expts_act)) != -1).any(dim=1).sum()
         else:
             n_read_rows = n_tokens
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py
@@ -54,7 +54,7 @@ def _matmul_ogs(
              M, N, K, K_W, # shapes
              # expt data
              Betas, Gammas,
-             GatherIndx,
+             GatherIndx, GatherDstIndx,  # GatherDstIndx is only used for launch metadata.
              ScatterSrcIndx, num_idxs,
              WriteBackIndx, writeback_size,
              ExptHist, ExptOffs, ExptTileOffs, ExptData,
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py
@@ -63,7 +63,7 @@ def _p_matmul_ogs(
              M, N, K, K_W, # shapes
              # expt data
              Betas, Gammas,
-             GatherIndx,
+             GatherIndx, GatherDstIndx,  # GatherDstIndx is only used for launch metadata.
              ScatterSrcIndx, num_idxs,
              WriteBackIndx, writeback_size,
              ExptHist, ExptOffs, ExptTileOffs, ExptData,