
Commit 273649e

[KERNELS] Support y_indx and uniform distribution (#8472)
1 parent 28533b1 commit 273649e

File tree: 3 files changed (+37, -10 lines)


python/triton_kernels/tests/test_distributed.py

Lines changed: 34 additions & 7 deletions
@@ -27,6 +27,23 @@ def _make_expt_dict_for_mode(n_shards, n_expts_tot, affinity_mode):
         raise ValueError(f"Unknown affinity mode: {affinity_mode}") from exc
 
 
+def _make_y_indx_for_mode(n_tokens_global, n_expts_tot, n_expts_act, n_shards, affinity_mode, dev):
+    y_indx_global = None
+    if affinity_mode == "uniform":
+        if n_expts_tot % n_shards != 0:
+            raise ValueError("uniform affinity requires experts evenly divisible by shards")
+        expts_per_rank = n_expts_tot // n_shards
+        rounds = (n_expts_act + n_shards - 1) // n_shards
+        if rounds > expts_per_rank:
+            raise ValueError("round-robin selection exceeds experts available per shard")
+        order = torch.arange(n_expts_act, device=dev, dtype=torch.int32)
+        shard_order = order % n_shards
+        intra_shard = order // n_shards
+        round_robin_indx = (shard_order * expts_per_rank + intra_shard).to(torch.int16)
+        y_indx_global = round_robin_indx.unsqueeze(0).expand(n_tokens_global, -1).contiguous()
+    return y_indx_global
+
+
 # ------------------------------------------------------------
 # fixture
 # ------------------------------------------------------------
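For orientation, here is a minimal standalone sketch of the index pattern the new _make_y_indx_for_mode helper produces under "uniform" affinity. The sizes below are made up for illustration and are not taken from the test:

import torch

# Hypothetical sizes, chosen only to make the pattern visible.
n_tokens_global, n_expts_tot, n_expts_act, n_shards = 2, 8, 4, 4
dev = "cpu"

expts_per_rank = n_expts_tot // n_shards                  # 2 experts owned by each shard
order = torch.arange(n_expts_act, device=dev, dtype=torch.int32)
shard_order = order % n_shards                            # shard serving the i-th selected slot
intra_shard = order // n_shards                           # local expert index on that shard
round_robin_indx = (shard_order * expts_per_rank + intra_shard).to(torch.int16)
print(round_robin_indx)                                   # tensor([0, 2, 4, 6], dtype=torch.int16)

# Every token is forced onto the same round-robin selection.
y_indx_global = round_robin_indx.unsqueeze(0).expand(n_tokens_global, -1).contiguous()
print(y_indx_global.shape)                                # torch.Size([2, 4])

The selection picks one expert from each shard in turn, which is what spreads the forced routing evenly across ranks.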
@@ -102,8 +119,8 @@ def test_make_expt_assignment(n_expts_shard, n_expts_tot, affinity_mode):
 # ------------------------------------------------------------
 
 
-def routing(logits, n_expts_act, all_gather=False):
-    sparse_logits = topk(logits, n_expts_act, all_gather=all_gather)
+def routing(logits, n_expts_act, all_gather=False, y_indx=None):
+    sparse_logits = topk(logits, n_expts_act, all_gather=all_gather, y_indx=y_indx)
     dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
     combine_indx = sparse_logits.mask_metadata.row_sorted_indx
     ragged_batch_metadata = make_ragged_tensor_metadata(sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0])
@@ -115,17 +132,18 @@ def routing(logits, n_expts_act, all_gather=False):
     return routing_data, gather_idx, scatter_idx, sparse_logits.indx
 
 
-def mixture_of_expt_nosharded(x_global, l_global, w_global, b_global, n_expts_act):
-    rdata, combine_indx, dispatch_indx, _ = routing(l_global, n_expts_act)
+def mixture_of_expt_nosharded(x_global, l_global, w_global, b_global, n_expts_act, y_indx=None):
+    rdata, combine_indx, dispatch_indx, _ = routing(l_global, n_expts_act, y_indx=y_indx)
     y_global = matmul_ogs(x_global, w_global, b_global, rdata, gather_indx=combine_indx, scatter_indx=dispatch_indx)
     return y_global
 
 
-def mixture_of_expt_epsharded(x_dp_local, l_dp_local, w_ep_local, b_ep_local, expt_assignment, n_expts_act):
+def mixture_of_expt_epsharded(x_dp_local, l_dp_local, w_ep_local, b_ep_local, expt_assignment, n_expts_act,
+                              y_indx=None):
     rank = dist.get_rank()
     expt_map = expt_assignment.expt_map[rank, :]
     # active global logits (sparse)
-    l_global_active = topk(l_dp_local, n_expts_act, apply_softmax=True, all_gather=True)
+    l_global_active = topk(l_dp_local, n_expts_act, apply_softmax=True, all_gather=True, y_indx=y_indx)
     # expert histogram, dispatch/combine indx
     active_indx = l_global_active.indx
     expt_sizes = l_global_active.mask_metadata.col_sum
@@ -264,7 +282,15 @@ def _run_expert_sharding(rank, world_size, *, n_tokens, d_model, n_expts_tot, n_
     l_dp_local = l_global[first_token_indx:last_token_indx, :]
     # routing
     # test correctness
-    y_global_ref = mixture_of_expt_nosharded(x_global, l_global, w_global, b_global, n_expts_act)
+    y_indx_global = _make_y_indx_for_mode(n_tokens_global, n_expts_tot, n_expts_act, n_shards, affinity_mode, dev)
+    y_global_ref = mixture_of_expt_nosharded(
+        x_global,
+        l_global,
+        w_global,
+        b_global,
+        n_expts_act,
+        y_indx=y_indx_global,
+    )
 
     def run_mixture():
         return mixture_of_expt_epsharded(
@@ -274,6 +300,7 @@ def run_mixture():
             b_ep_local,
             expt_assignment,
             n_expts_act,
+            y_indx=y_indx_global,
         )
 
     # test cuda graph capture + replay with symmetric memory
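A quick, self-contained way to see why this selection is called "uniform": mapping each forced expert index back to the rank that owns it shows every shard receiving the same share of routed work whenever n_shards divides n_expts_act. This is only an illustrative check with made-up sizes, not part of the commit:

import torch

n_tokens_global, n_expts_tot, n_expts_act, n_shards = 32, 8, 4, 4
expts_per_rank = n_expts_tot // n_shards

order = torch.arange(n_expts_act, dtype=torch.int64)
round_robin = (order % n_shards) * expts_per_rank + order // n_shards
y_indx_global = round_robin.unsqueeze(0).expand(n_tokens_global, -1)

# Recover the owning rank of every forced (token, expert) assignment and count them.
owning_rank = (y_indx_global // expts_per_rank).flatten()
per_rank = torch.bincount(owning_rank, minlength=n_shards)
print(per_rank)   # tensor([32, 32, 32, 32]) -> identical load on every shard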

python/triton_kernels/triton_kernels/topk.py

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,6 @@ def topk_forward(x, k, apply_softmax=True, dim=1, y_indx=None, n_rows=None, all_
     assert len(x.shape) == 2
     assert x.shape_max[-1] < 32768
     assert dim == 1
-    assert not all_gather or not use_provided_indx
     n_rows, n_cols = x.shape
     n_rows_max, _ = x.shape_max
     dev = x.device
@@ -62,7 +61,8 @@ def topk_forward(x, k, apply_softmax=True, dim=1, y_indx=None, n_rows=None, all_
     )
     if all_gather:
         y_vals_hdl.barrier(channel=0)
-        y_indx_hdl.barrier(channel=0)
+        if y_indx_hdl is not None:
+            y_indx_hdl.barrier(channel=0)
         bitmatrix_hdl.barrier(channel=0)
     bitmatrix_shape = [n_rows * dist.get_world_size() if all_gather else n_rows, n_cols]
     bitmatrix_shape_max = [n_rows_out_max, None]
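The deleted assertion used to forbid combining all_gather with a caller-provided index; now that the combination is allowed, the barrier on the index handle is guarded, since y_indx_hdl can apparently be None when the caller supplies the indices (an inference from the guard above, not stated in the diff). A tiny sketch of that control flow, with stand-in handles rather than the real symmetric-memory handles created inside topk_forward:

class _StubHandle:
    """Stand-in for a symmetric-memory handle; only mimics the barrier call."""
    def __init__(self, name):
        self.name = name
    def barrier(self, channel=0):
        print(f"barrier({self.name}, channel={channel})")

def _sync_after_all_gather(y_vals_hdl, y_indx_hdl, bitmatrix_hdl):
    y_vals_hdl.barrier(channel=0)
    if y_indx_hdl is not None:          # skipped when the caller supplied y_indx
        y_indx_hdl.barrier(channel=0)
    bitmatrix_hdl.barrier(channel=0)

# With a provided y_indx there is no index handle to wait on.
_sync_after_all_gather(_StubHandle("y_vals"), None, _StubHandle("bitmatrix"))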

python/triton_kernels/triton_kernels/topk_details/_topk_forward.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def _topk_forward(X, stride_xm, # inputs
     mask_m = offs_m[:, None] < n_rows
     if USE_PROVIDED_INDX:
         tl.static_assert(len(PeerYis) == 1)
-        Yi_ptrs = PeerYis[0] + offs_m[:, None] * stride_ym + offs_y_n[None, :]
+        Yi_ptrs = PeerYis[0] + (dst_offs_m + offs_m[:, None]) * stride_ym + offs_y_n[None, :]
         y_indices = tl.load(Yi_ptrs, mask=mask_m)
         Xv_ptrs = X + offs_m[:, None] * stride_xm + y_indices
         y_values = tl.load(Xv_ptrs, mask=mask_m)
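The kernel fix adds the rank's destination row offset (dst_offs_m) when addressing the provided index buffer. A rough NumPy illustration of why that matters when the buffer spans all ranks' rows; the rank * n_rows offset formula here is an assumption made for the sake of the example:

import numpy as np

world_size, n_rows, k = 2, 3, 2
# A global index buffer whose rows differ per rank's slice, for illustration.
y_indx_global = np.concatenate(
    [np.full((n_rows, k), fill_value=r, dtype=np.int16) for r in range(world_size)]
)

rank = 1
offs_m = np.arange(n_rows)
dst_offs_m = rank * n_rows                         # assumed per-rank row offset into the gathered buffer

rows_old = y_indx_global[offs_m]                   # old addressing: every rank reads rows [0, n_rows)
rows_new = y_indx_global[dst_offs_m + offs_m]      # fixed addressing: each rank reads its own slice
print(rows_old[0, 0], rows_new[0, 0])              # 0 1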
