|
| 1 | +import pytest |
| 2 | +import torch |
| 3 | +from triton_bench.routing import routing, routing_torch |
| 4 | +from triton_bench.testing import assert_close |
| 5 | +from triton_bench.matmul_ogs_details.metadata import compute_metadata |
| 6 | +from triton_bench.testing import assert_equal |
| 7 | + |
| 8 | + |
def init_data(n_tokens, n_expts_tot, dtype=torch.float16, device="cuda"):
    """Create an (n_tokens, n_expts_tot) tensor of pseudo-random router logits.

    Each row is built from a random permutation of [0, n_expts_tot), so all
    logits within a row are distinct bit patterns: the reference and triton
    implementations do not tie-break equal experts the same way, and distinct
    values keep the top-k selection unambiguous.

    Args:
        n_tokens: number of rows (tokens) to generate.
        n_expts_tot: number of columns (experts); also the permutation length.
        dtype: 2-byte dtype that the int16 bit patterns are reinterpreted as.
        device: target device for the returned tensor (defaults to "cuda",
            matching the previously hard-coded behavior).

    Returns:
        Tensor of shape (n_tokens, n_expts_tot) with dtype ``dtype``.
    """
    randbits = [torch.randperm(n_expts_tot) for _ in range(n_tokens)]
    # Build int16 bit patterns (the 16384 offset plus a per-row shift keeps the
    # reinterpreted values in a well-behaved range), view them as `dtype`, and
    # alternate the sign per row.
    x = [(-1)**i * ((16384 + ((i * 512) % 4096) + bits).to(torch.int16).view(dtype))
         for i, bits in enumerate(randbits)]
    return torch.stack(x).to(device=device)
| 15 | + |
| 16 | + |
def ref_expt_data(routing_data, n_gates, block_m):
    """Reference (pure-torch) construction of the ragged-matmul metadata buffer.

    Layout of the returned int32 tensor (segment lengths in brackets):
        [n_expts_tot]   per-expert token histogram
        [1]             zero pad (base element for the token prefix sum)
        [n_expts_tot]   inclusive prefix sum of the histogram
        [1]             zero pad (base element for the block prefix sum)
        [n_expts_tot]   inclusive prefix sum of per-expert matmul-block counts
        [grid_m]        packed (block_idx << 16 | expert_idx) entries, -1 padded
    """
    hist = routing_data.expt_hist
    n_expts = routing_data.n_expts_tot
    blocks_per_expt = (hist + block_m - 1) // block_m  # matmul blocks needed
    token_prefix = torch.cumsum(hist, dim=0)           # prefix sum of tokens
    block_prefix = torch.cumsum(blocks_per_expt, dim=0)  # prefix sum of blocks
    # Upper bound on the number of matmul blocks launched, assuming the worst
    # split: every expert holds one token except a single expert with the rest.
    if n_gates <= n_expts:
        grid_m = n_gates
    else:
        # equivalent to ceil_div(n_gates - n_expts + 1, block_m) + n_expts - 1
        grid_m = n_expts - 1 - ((n_expts - n_gates - 1) // block_m)
    # Map each launched block to its expert and its block index within that
    # expert; unused tail slots stay -1.
    block_map = -torch.ones(grid_m, dtype=torch.int32)
    next_slot = 0
    for expt, n_blks in enumerate(blocks_per_expt.tolist()):
        for blk in range(n_blks):
            block_map[next_slot] = (blk << 16) + expt
            next_slot += 1

    out = torch.zeros(n_expts * 3 + 2 + grid_m, dtype=torch.int32, device=hist.device)
    out[:n_expts] = hist
    out[n_expts + 1:2 * n_expts + 1] = token_prefix
    out[2 * n_expts + 2:3 * n_expts + 2] = block_prefix
    out[3 * n_expts + 2:] = block_map
    return out
| 44 | + |
| 45 | + |
@pytest.mark.parametrize("n_tokens", [371, 255, 256, 8192, 1023, 1024])
@pytest.mark.parametrize("n_expts_tot, n_expts_act", [(128, 4)])
@pytest.mark.parametrize("block_m", [64, 128])
def test_op(n_tokens, n_expts_tot, n_expts_act, block_m):
    """Check the triton routing kernel against the pure-torch reference.

    Runs both implementations on identical logits and compares routing data,
    metadata buffers, and gather/scatter index tensors.
    """
    torch.manual_seed(2)
    tri_logits = init_data(n_tokens, n_expts_tot).detach()
    ref_logits = tri_logits.clone()
    ref_routing_data, ref_gather, ref_scatter = routing_torch(ref_logits, n_expts_act)
    tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act)
    ref_metadata = ref_expt_data(ref_routing_data, n_tokens * n_expts_act, block_m)
    tri_metadata = compute_metadata(tri_routing_data, n_tokens * n_expts_act, block_m).buffer

    assert_close(ref_routing_data.gate_scal, tri_routing_data.gate_scal, 2e-2, 4e-3)
    assert_equal(ref_routing_data.expt_hist, tri_routing_data.expt_hist)
    assert_equal(ref_metadata, tri_metadata)
    # Fixed: these previously compared ref_routing_data against itself, which
    # was vacuously true and tested nothing.
    assert ref_routing_data.n_expts_tot == tri_routing_data.n_expts_tot
    assert ref_routing_data.n_expts_act == tri_routing_data.n_expts_act

    def _assert_indx_equal(ref, tri):
        # The triton buffers may be over-allocated: the leading entries must
        # match the reference and the tail must be -1 padding.
        assert_equal(ref, tri[:len(ref)])
        assert torch.all(tri[len(ref):] == -1)

    _assert_indx_equal(ref_gather.src_indx, tri_gather.src_indx)
    _assert_indx_equal(ref_gather.dst_indx, tri_gather.dst_indx)
    _assert_indx_equal(ref_scatter.src_indx, tri_scatter.src_indx)
    _assert_indx_equal(ref_scatter.dst_indx, tri_scatter.dst_indx)
| 72 | + |
| 73 | + |
def bench_routing():
    """Profile 100 iterations of routing + metadata computation with proton."""
    import triton.profiler as proton
    n_tokens, block_m = 2048, 128
    n_expts_tot, n_expts_act = 128, 4
    logits = init_data(n_tokens, n_expts_tot)
    proton.start("routing")
    proton.activate()
    for _ in range(100):
        routing_data, gather_indx, scatter_indx = routing(logits, n_expts_act)
        metadata = compute_metadata(routing_data, n_tokens * n_expts_act, block_m)
    proton.finalize()
| 86 | + |
| 87 | + |
# Running this file directly executes the profiling benchmark; the correctness
# tests above are collected by pytest instead.
if __name__ == "__main__":
    bench_routing()
0 commit comments