[kernels] restore old behavior that output for tokens routed to zero experts should be zero-initialized (#7150)

ptillet · web-flow · commit e1fb6f69a161 · 2025-06-11T01:06:43.000-07:00
#7140 introduced a subtle change in the semantics of `matmul_ogs`. We actually care that the output of rows that have `scatter_indx == -1` be zero-initialized, because some expert-parallelism code may reduce over them. This change also adds a mask that was missing in the AMD implementation, which most likely explains the test failure.
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_finalize_matmul.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_finalize_matmul.py
@@ -291,7 +291,7 @@ def _finalize_matmul(
                         if src_idx != -1:
                             As = A + src_idx.to(tl.int64) * stride_a_m + offs_n
                             for ki in tl.static_range(K):
-                                acc += tl.load(As, mask=n_mask, other=0.0)
+                                acc += tl.load(As, mask=(src_idxs != -1)[:, None] & n_mask[None, :], other=0.0)
                                 As += stride_a_k
                 else:
                     As = A + src_idxs.to(tl.int64)[:, None] * stride_a_m + offs_n[None, :]
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py
@@ -387,7 +387,7 @@ def _compute_writeback_idx(
     is_src_active = (src_idxs != -1).to(tl.int32)
     num_src_active = tl.sum(is_src_active, axis=1)
 
-    need_finalize_scatter = mask_m & (num_src_active > 1)
+    need_finalize_scatter = mask_m & (num_src_active != 1)
     finalize_scatter_count = tl.sum(need_finalize_scatter.to(tl.int32))
     if finalize_scatter_count == 0:
         return