
Commit a1316c4

Fix BitmatrixMetadata col/row_sorted_indx (#8599)
`col_sorted_indx` and `row_sorted_indx` were passed to the `BitmatrixMetadata` constructor in the wrong order. The user side (`combine_indx` and `dispatch_indx`) also read the wrong field, so the two errors cancelled out. This PR fixes the constructor call to pass the fields in the right order and updates the user side accordingly.
1 parent dc4efec commit a1316c4
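
A minimal sketch of the failure mode (illustrative only; the toy `Metadata` class and index values below stand in for the real `BitmatrixMetadata`, which also carries `col_sum`): swapping positional arguments in the constructor silently swaps the fields, and a consumer that reads the swapped field still sees the intended data, which is how the bug stayed hidden. Constructing with keyword arguments, as this commit does, removes the ambiguity.

```python
# Illustrative sketch only: `Metadata` and the index values are hypothetical
# stand-ins for BitmatrixMetadata and its sorted-index tensors.
from dataclasses import dataclass


@dataclass
class Metadata:
    col_sorted_indx: list
    row_sorted_indx: list


col_sorted = [1, 6, 3, 7, 0, 2, 4, 5]  # hypothetical values
row_sorted = [4, 0, 5, 2, 6, 7, 1, 3]

# Bug: positional arguments swapped, so each field holds the other index array.
buggy = Metadata(row_sorted, col_sorted)
# A consumer that also reads the "wrong" field still gets the intended data,
# which is why the swap went unnoticed.
assert buggy.col_sorted_indx == row_sorted

# Fix: construct with keyword arguments and read the matching field.
fixed = Metadata(col_sorted_indx=col_sorted, row_sorted_indx=row_sorted)
assert fixed.col_sorted_indx == col_sorted
```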

4 files changed (+23, -15 lines)

python/triton_kernels/bench/distributed.py

Lines changed: 4 additions & 4 deletions
@@ -165,8 +165,8 @@ def routing(
         )
         active_indx = logits_global.indx
         expt_sizes = logits_global.mask_metadata.col_sum
-        dispatch_indx = logits_global.mask_metadata.col_sorted_indx
-        combine_indx = logits_global.mask_metadata.row_sorted_indx
+        dispatch_indx = logits_global.mask_metadata.row_sorted_indx
+        combine_indx = logits_global.mask_metadata.col_sorted_indx
         logits_global_metadata = make_ragged_tensor_metadata(expt_sizes, dispatch_indx.shape[0])
         x = convert_dp_to_ep(x, expt_assignment, active_indx, dispatch_indx)
         logits_local_metadata = remap_ragged_tensor_metadata(logits_global_metadata, expt_map)
@@ -184,8 +184,8 @@ def routing(
     else:
         # If mode is not specified or we have a single process, we do single-GPU routing.
         logits = topk(logits, n_expts_act, y_indx=y_indx, apply_softmax=not sm_first)
-        dispatch_indx = logits.mask_metadata.col_sorted_indx
-        combine_indx = logits.mask_metadata.row_sorted_indx
+        dispatch_indx = logits.mask_metadata.row_sorted_indx
+        combine_indx = logits.mask_metadata.col_sorted_indx
         ragged_batch_metadata = make_ragged_tensor_metadata(logits.mask_metadata.col_sum, dispatch_indx.shape[0])
         gate_scal = logits.vals.flatten()[combine_indx]
         routing_data = RoutingData(gate_scal, ragged_batch_metadata.slice_sizes, n_expts_tot, n_expts_act,

python/triton_kernels/tests/test_distributed.py

Lines changed: 4 additions & 4 deletions
@@ -120,8 +120,8 @@ def test_make_expt_assignment(n_expts_shard, n_expts_tot, affinity_mode):
 
 def routing(logits, n_expts_act, all_gather=False, y_indx=None):
     sparse_logits = topk(logits, n_expts_act, all_gather=all_gather, y_indx=y_indx)
-    dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
-    combine_indx = sparse_logits.mask_metadata.row_sorted_indx
+    dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
+    combine_indx = sparse_logits.mask_metadata.col_sorted_indx
     ragged_batch_metadata = make_ragged_tensor_metadata(sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0])
     gate_scal = sparse_logits.vals.flatten()[combine_indx]
     routing_data = RoutingData(gate_scal, ragged_batch_metadata.slice_sizes, logits.shape[-1], n_expts_act,
@@ -146,8 +146,8 @@ def mixture_of_expt_epsharded(x_dp_local, l_dp_local, w_ep_local, b_ep_local, ex
     # expert histogram, dispatch/combine indx
     active_indx = l_global_active.indx
     expt_sizes = l_global_active.mask_metadata.col_sum
-    dispatch_indx = l_global_active.mask_metadata.col_sorted_indx
-    combine_indx = l_global_active.mask_metadata.row_sorted_indx
+    dispatch_indx = l_global_active.mask_metadata.row_sorted_indx
+    combine_indx = l_global_active.mask_metadata.col_sorted_indx
     # ragged tensor metadata
     x_global_metadata = make_ragged_tensor_metadata(expt_sizes, dispatch_indx.shape[0])
     # convert x from dp-local to expert-sorted, ep-local

python/triton_kernels/tests/test_matmul.py

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ def alloc_rand_like(x):
 def init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter, device="cuda"):
     logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=device, requires_grad=True)
     sparse_logits = topk(logits, n_expts_act)
-    dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
-    combine_indx = sparse_logits.mask_metadata.row_sorted_indx
+    dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
+    combine_indx = sparse_logits.mask_metadata.col_sorted_indx
     ragged_batch_metadata = make_ragged_tensor_metadata(sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0])
     routing_data = RoutingData(None, ragged_batch_metadata.slice_sizes, n_expts_tot, n_expts_act, ragged_batch_metadata)
     gather_idx = GatherIndx(combine_indx, dispatch_indx) if do_gather else None

python/triton_kernels/triton_kernels/tensor_details/bitmatrix.py

Lines changed: 13 additions & 5 deletions
@@ -14,15 +14,15 @@ class BitmatrixMetadata:
     1 1 1 0 0 0 1
     0 0 1 0 1 0 0]
    `col_sum` = [1 2 3 0 2 2 1]
-   `row_sorted_indx` = cat([3 6 8], [1 9], [0 2 4 10], [5 7])
    `col_sorted_indx` = cat([5], [3 6], [0 7], [], [9 1 10], [2 4], [8])
+   `row_sorted_indx` = cat([3 6 8], [1 9], [0 2 4 10], [5 7])
    """
    # the number of entries equal to 1 in each column
    col_sum: torch.Tensor
-   # indices of nonzero values numbered col-major, grouped by rows, concatenated
-   row_sorted_indx: torch.Tensor
    # indices of nonzero values numbered row-major, grouped by cols, concatenated
    col_sorted_indx: torch.Tensor
+   # indices of nonzero values numbered col-major, grouped by rows, concatenated
+   row_sorted_indx: torch.Tensor
 
 
 # `make_bitmatrix_metadata`: entry point for optimized implementation
@@ -143,7 +143,11 @@ def make_bitmatrix_metadata(nonzero_indx, bitmatrix):
         col_offs, #
         TOKS_PER_ROW=toks_per_row, BLOCK_PER_TOK=PARTIAL_BLOCK_M, #
     )
-    return BitmatrixMetadata(col_sum, col_sorted_indx, row_sorted_indx)
+    return BitmatrixMetadata(
+        col_sum=col_sum,
+        col_sorted_indx=col_sorted_indx,
+        row_sorted_indx=row_sorted_indx,
+    )
 
 
 # `make_bitmatrix_metadata_torch`: entry point for reference implementation
@@ -157,4 +161,8 @@ def make_bitmatrix_metadata_torch(nonzero_indx, bitmatrix):
     col_sorted_indx = pad(torch.argsort(nonzero_indx[nonzero_indx != -1], stable=True), nonzero_indx.numel())
     row_sorted_indx = pad(torch.argsort(col_sorted_indx[col_sorted_indx != -1], stable=True), nonzero_indx.numel())
     col_sum = torch.histc(nonzero_indx, bins=n_batches, max=n_batches - 1).int()
-    return BitmatrixMetadata(col_sum, col_sorted_indx, row_sorted_indx)
+    return BitmatrixMetadata(
+        col_sum=col_sum,
+        col_sorted_indx=col_sorted_indx,
+        row_sorted_indx=row_sorted_indx,
+    )
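
For intuition about the two orderings documented above, here is a small standalone sketch in plain PyTorch (not the library's kernels); the 8-element `nonzero_indx` example is made up, and `torch.bincount` stands in for the `histc` call used by the reference implementation:

```python
import torch

# Hypothetical bitmatrix with 4 rows and 4 columns, two nonzeros per row.
# `nonzero_indx` lists, in row-major order, the column index of each nonzero
# entry (no -1 padding), mirroring the input of make_bitmatrix_metadata_torch.
nonzero_indx = torch.tensor([2, 3, 0, 2, 1, 2, 0, 1])
n_cols = 4

col_sum = torch.bincount(nonzero_indx, minlength=n_cols)       # nonzeros per column
col_sorted_indx = torch.argsort(nonzero_indx, stable=True)     # row-major numbers, grouped by column
row_sorted_indx = torch.argsort(col_sorted_indx, stable=True)  # col-major numbers, grouped by row

print(col_sum)          # tensor([2, 2, 3, 1])
print(col_sorted_indx)  # tensor([2, 6, 4, 7, 0, 3, 5, 1])
print(row_sorted_indx)  # tensor([4, 7, 0, 5, 2, 6, 1, 3])

# The two orderings are mutually inverse permutations: gathering by one and
# then by the other restores the original row-major order.
assert torch.equal(row_sorted_indx[col_sorted_indx], torch.arange(8))
```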
