[triton_kernels][matmul] skip some unnecessary compute (#7140)

ptillet · web-flow · commit 7be5b8a79d82 · 2025-06-10T18:31:21.000-07:00
diff --git a/python/triton_kernels/tests/test_matmul.py b/python/triton_kernels/tests/test_matmul.py
@@ -414,7 +414,15 @@ def round_x(x, idx):
                              rdata, gindx, sindx, round_x=round_x, round_y=round_y, gammas=gs1_ref)
     scale = lambda val, scal: val if scal is None else val / scal
     if n_expt_shards > 1:
-        if not do_scatter:
+        if do_scatter:
+            indx = sindx.dst_indx[sindx.dst_indx != -1]
+            ref_y = ref_y[indx // n_expts_act, :]
+            if act_is_float8:
+                tri_y = tri_y.view(torch.int8)
+            tri_y = tri_y[indx // n_expts_act, :]
+            if act_is_float8:
+                tri_y = tri_y.view(act_dtype)
+        else:
             n_rows = rdata.expt_hist.sum()
             assert n_rows > 0
             ref_y = ref_y[:n_rows]
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs.py
@@ -574,12 +574,12 @@ def init_allocation(x, w, precision_config, fused_activation, routing_data, gath
 def apply_allocation(allocation: MatmulAllocation, output):
     ret = dict()
     if output is None:
-        output = torch.zeros(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
+        output = torch.empty(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
     else:
         assert output.shape == allocation.output[0]
     ret["output"] = output[None, :, :]
     ret["scratchpad"] = {
-        k: torch.zeros(v[0], device=allocation.device, dtype=v[1])
+        k: torch.empty(v[0], device=allocation.device, dtype=v[1])
             for k, v in allocation.scratchpads.items()
     }
     return ret
@@ -837,7 +837,6 @@ def matmul_ogs(x, w, bias,
     out = apply_postprocessing_features(scatter_indx, finalize_scatter_idxs, opt_flags, expt_token_offs_raw,
                                 num_indx, precision_config, routing_data,
                                 postprocessing_features, memory, fused_postprocess_activation, epilogue)
-
     # remove split-k
     out = out.squeeze(0)
     if not is_input_batched:
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py
@@ -385,9 +385,9 @@ def _compute_writeback_idx(
     src_offs = offs_m[:, None] * N_EXPTS_ACT + tl.arange(0, N_EXPTS_ACT)[None, :]
     src_idxs = tl.load(ScatterSrcIndx + src_offs, mask=mask_m[:, None], other=-1)
     is_src_active = (src_idxs != -1).to(tl.int32)
-    has_one_active = tl.sum(is_src_active, axis=1) == 1
+    num_src_active = tl.sum(is_src_active, axis=1)
 
-    need_finalize_scatter = mask_m & (~has_one_active)
+    need_finalize_scatter = mask_m & (num_src_active > 1)
     finalize_scatter_count = tl.sum(need_finalize_scatter.to(tl.int32))
     if finalize_scatter_count == 0:
         return