
Commit bea27e3

[triton_kernels] some clean-up of the routing (#8330)
1 parent 9117748 commit bea27e3

8 files changed: +177 −225 lines changed

python/triton_kernels/bench/distributed.py

Lines changed: 41 additions & 3 deletions
@@ -18,12 +18,10 @@
     ScatterIndx,
     compute_expt_data_torch,
     topk_torch,
-    prune_routing,
     routing_from_bitmatrix,
 )
 from triton_kernels.topk import topk
 from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
-from triton_kernels.routing_details._routing_compute import _routing_clear_bitmatrix
 from triton_kernels.target_info import get_cdna_version, is_hip, is_cuda, cuda_capability_geq
 from triton_kernels.tensor_details import layout
 from triton_kernels.tensor import Bitmatrix
@@ -291,6 +289,46 @@ def pack_bitmatrix(
     tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)


+@triton.jit
+def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr):
+    pid_m = tl.program_id(0)
+    cutoff_word = cutoff // 32
+    cutoff_bit = cutoff % 32
+    cutoff_mask = (1 << (cutoff_bit)) - 1
+    for start_n in range(0, shape_bn, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
+        values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, mask=offs_n < shape_bn)
+        values = tl.where(offs_n == cutoff_word, values & cutoff_mask, values)
+        values = tl.where(offs_n > cutoff_word, 0, values)
+        tl.store(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, values, mask=offs_n < shape_bn)
+
+
+class PruneRouting(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep):
+        from triton_kernels.compaction import compaction
+        n_tokens_pad = expt_scal.shape[0]
+        assert n_expts_tot % simulated_ep == 0
+        _routing_clear_bitmatrix[(n_tokens_pad, )](
+            bitmatrix.storage.data,
+            bitmatrix.storage.data.stride(0),
+            bitmatrix.storage.data.stride(1),
+            bitmatrix.storage.data.shape[1],
+            n_expts_tot // simulated_ep,
+            BLOCK_N=512,
+        )
+        # perform compaction to update expt_scal / expt_indx
+        expt_scal, expt_indx = compaction(expt_scal, expt_indx, bitmatrix)
+        n_expts_tot = n_expts_tot // simulated_ep
+        bitmatrix.shape[-1] = n_expts_tot
+        return expt_scal, expt_indx, bitmatrix
+
+
+def prune_routing(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep):
+    return PruneRouting.apply(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep)
+
+
 def routing_triton(x, logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None, EP=1, TP=1):
     _, n_expts_tot = logits.shape

@@ -354,7 +392,7 @@ def routing(x, logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None,
         else:
             raise ValueError(f"Unknown backend: {backend}")
     else:
-        return x, *triton_kernels.routing.routing(logits, n_expts_act, sm_first, expt_indx, EP, n_rows), None
+        return x, *triton_kernels.routing.routing(logits, n_expts_act, sm_first, expt_indx, n_rows), None


 # The following dummy methods simulate the behavior of distributed operations
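
Note on the hunk above: the benchmark now carries its own copy of _routing_clear_bitmatrix plus a PruneRouting autograd wrapper instead of importing them (see the removed import lines). The kernel zeroes every expert bit at index >= cutoff so that prune_routing can simulate expert parallelism by keeping only the first n_expts_tot // simulated_ep experts before compaction. A minimal pure-PyTorch sketch of the same masking, assuming an unpacked (n_tokens, n_experts) boolean matrix rather than the packed 32-bit Bitmatrix storage the kernel actually touches:

import torch

def clear_bitmatrix_reference(bits: torch.Tensor, cutoff: int) -> torch.Tensor:
    # bits: (n_tokens, n_experts) bool mask of selected experts (hypothetical dense layout).
    # Keep only experts with index < cutoff; the Triton kernel does the equivalent on
    # packed 32-bit words via cutoff_word / cutoff_bit / cutoff_mask.
    keep = torch.arange(bits.shape[1], device=bits.device) < cutoff
    return bits & keep

# With n_expts_tot=8 and simulated_ep=4, cutoff = 8 // 4 = 2, so only experts 0 and 1
# survive; compaction then rewrites expt_scal / expt_indx to match the pruned bitmatrix.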

python/triton_kernels/tests/conftest.py

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,6 @@
 import pytest
 import tempfile
+import os


 def pytest_addoption(parser):
@@ -29,3 +30,11 @@ def fresh_triton_cache():
     with knobs.cache.scope(), knobs.runtime.scope():
         knobs.cache.dir = tmpdir
         yield tmpdir
+
+
+def pytest_configure(config):
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER")
+    if worker_id is not None and worker_id.startswith("gw"):
+        import torch
+        gpu_id = int(worker_id[2:])  # map gw0 → 0, gw1 → 1, ...
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id % torch.cuda.device_count())
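
Note on the conftest change: pytest_configure runs in every pytest-xdist worker process (the guard skips the controller, where PYTEST_XDIST_WORKER is unset), and it pins each worker to a single GPU by round-robin over torch.cuda.device_count(). A small illustration of the resulting assignment, assuming four workers on a 2-GPU machine (hypothetical numbers, not part of the commit):

# Hypothetical walk-through of the round-robin mapping with 2 visible GPUs.
num_gpus = 2
for worker_id in ["gw0", "gw1", "gw2", "gw3"]:
    gpu_id = int(worker_id[2:]) % num_gpus
    print(f"{worker_id} -> CUDA_VISIBLE_DEVICES={gpu_id}")
# gw0 -> 0, gw1 -> 1, gw2 -> 0, gw3 -> 1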

python/triton_kernels/tests/test_matmul.py

Lines changed: 19 additions & 45 deletions
@@ -45,19 +45,16 @@ def mask_indx(idx, n_expts_act):
     return idx


-def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter, device="cuda"):
+def init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter, device="cuda"):
     logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=device, requires_grad=True)
-    routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act, simulated_ep=n_expt_shards)
+    routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act)
     routing_data.gate_scal = None
     gather_idx = gather_idx if do_gather else None
     scatter_idx = scatter_idx if do_scatter else None
-    # TODO: re-enable
-    # if do_gather and do_scatter and n_expts_act == 1 and n_expt_shards == 1:
-    #     scatter_idx = mask_indx(scatter_idx, n_expts_act)
     return m, routing_data, gather_idx, scatter_idx


-def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode, act_dtype, weight_dtype,
+def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, mode, act_dtype, weight_dtype,
                       has_y_gammas, requires_grad=True, device="cuda",
                       inner_expt_opt=None, padding_block_k=None):
     torch.manual_seed(0)
@@ -70,7 +67,7 @@ def init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_
     else:
         in_m = m * (n_expts_act if gindx is None else 1)
     shape_x = (n_expts_tot, in_m, k) if mode == 'batched' else (in_m, k)
-    shape_batch = tuple() if (mode == "plain" or inner_expt_opt is not None) else (n_expts_tot // n_expt_shards, )
+    shape_batch = tuple() if (mode == "plain" or inner_expt_opt is not None) else (n_expts_tot, )
     x = alloc_rand(shape_x, device=device, dtype=act_dtype, requires_grad=requires_grad)
     w = alloc_rand(shape_batch + (k, n), device=device, dtype=weight_dtype, requires_grad=requires_grad)
     bias = alloc_rand(shape_batch + (n, ), device=device, dtype=torch.float32, requires_grad=requires_grad)
@@ -194,7 +191,6 @@ class Case:
     weight_dtype_str: str
     n_expts_tot: int = 1
     n_expts_act: int = 1
-    n_expt_shards: int = 1
     split_k: int = 1
     hbm_swizzling: bool = False
     epilogue_subtile: Union[int, None] = None
@@ -216,10 +212,6 @@ class Case:
     Case(5, 7, 0, "batched", "float16", "float16"),
     # Non-mx types:
     Case(16, 256, 256, "ragged", "float16", "float16", 128, 4),
-    Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, n_expt_shards=2),
-    Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, n_expt_shards=4),
-    Case(400, 300, 500, "ragged", "float16", "float16", 32, 4, n_expt_shards=4),
-    Case(16, 256, 256, "ragged", "float16", "float16", 4, 1, n_expt_shards=2),
     Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, split_k=3),
     Case(16, 256, 256, "ragged", "float16", "float16", 128, 4, split_k=3),
     Case(300, 400, 400, "batched", "float8_e5m2", "float8_e5m2", 5, 1),
@@ -235,8 +227,6 @@ class Case:
     Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=2),
     Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=4),
     Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2),
-    Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, n_expt_shards=2),
-    Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 1, n_expt_shards=2),
     Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, split_k=2),
     Case(1000, 400, 400, "ragged", "float16", "float16", 3, 1),
     Case(1000, 700, 700, "ragged", "float16", "float16", 8, 2),
@@ -291,19 +281,17 @@ class Case:
     Case(300, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz"),
     Case(1000, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 3, 1),
     Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2),
-    Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, n_expt_shards=2),
     Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, split_k=2),
     Case(300, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn"),
     Case(1000, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 3, 1),
     Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2),
-    Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2, n_expt_shards=2),
 ] + [
-    Case(320, 400, 400, mode, dtype, dtype, n_expts_tot, n_expts_act, n_expt_shards=n_expt_shards,
+    Case(320, 400, 400, mode, dtype, dtype, n_expts_tot, n_expts_act,
          x_transpose=x_transpose, w_transpose=w_transpose, y_transpose=y_transpose)
-    for (mode, n_expts_tot, n_expts_act, n_expt_shards) in (
-        ("batched", 1, 1, 1),
-        ("ragged", 8, 4, 1),
-        ("ragged", 32, 4, 4),
+    for (mode, n_expts_tot, n_expts_act) in (
+        ("batched", 1, 1),
+        ("ragged", 8, 4),
+        ("ragged", 32, 4),
     )
     for dtype in ("float16", "float8_e5m2")
     for x_transpose in (False, True)
@@ -326,7 +314,7 @@ class Case:
 @pytest.mark.parametrize("has_y_gammas", [False, True])
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
-            n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
+            n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
             x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope, fresh_knobs):
     # TODO: remove when Triton FP8 supports proper RTNE
@@ -424,17 +412,17 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
     weight_dtype = dtype_str_to_torch(weight_dtype_str)
     act_dtype = dtype_str_to_torch(act_dtype_str)
     precision_opt = init_precision(act_dtype, act_is_float8, weight_dtype, weight_mxfp,
-                                   n_expts_tot // n_expt_shards, expt_is_inner, device=device)
+                                   n_expts_tot, expt_is_inner, device=device)
     # precision_opt.x_pad_trans_requires_flexpoint = False
     if mode == "ragged":
-        m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter,
+        m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter,
                                                    device=device)
     else:
         rdata = gindx = sindx = None

     padding_block_k = 32
     x_tri, w_tri, bias_tri, gs0_tri, gs1_tri = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act,
-                                                                 n_expt_shards, mode, torch.bfloat16 if act_mxfp8 else act_dtype, #
+                                                                 mode, torch.bfloat16 if act_mxfp8 else act_dtype, #
                                                                  torch.bfloat16 if weight_mxfp else weight_dtype,
                                                                  has_y_gammas, requires_grad=test_bwd, device=device,
                                                                  inner_expt_opt=inner_expt_opt, padding_block_k=padding_block_k)
@@ -446,9 +434,9 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
         w_tri = w_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd)
     if y_transpose:
         if mode == "batched":
-            yT_shape = (n_expts_tot // n_expt_shards, n, x_tri.shape[-2])
+            yT_shape = (n_expts_tot, n, x_tri.shape[-2])
         elif expt_is_inner:
-            yT_shape = (n_expts_tot // n_expt_shards, n, k)
+            yT_shape = (n_expts_tot, n, k)
         elif sindx is not None:
             yT_shape = (n, m)
         else:
549537
assert val.ndim == 3
550538
return val / scal[:, None, None]
551539

552-
if n_expt_shards > 1:
553-
if do_scatter:
554-
indx = sindx.dst_indx[sindx.dst_indx != -1]
555-
ref_y = ref_y[indx // n_expts_act, :]
556-
if act_is_float8:
557-
tri_y = tri_y.view(torch.int8)
558-
tri_y = tri_y[indx // n_expts_act, :]
559-
if act_is_float8:
560-
tri_y = tri_y.view(act_dtype)
561-
elif not expt_is_inner:
562-
n_rows = rdata.expt_hist.sum()
563-
assert n_rows > 0
564-
ref_y = ref_y[:n_rows]
565-
tri_y = tri_y[:n_rows]
566540
if act_mxfp8:
567541
tri_y = upcast_from_mxfp(tri_y, precision_opt.out_scale, target_dtype=torch.bfloat16, axis=-1).to(ref_y.dtype)
568542
ref_y_quant, ref_y_scale = downcast_to_mxfp_torch(ref_y, act_dtype, axis=-1)
@@ -683,18 +657,18 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter,
683657
"split_k": split_k,
684658
"fused_scatter": fused_scatter,
685659
}
686-
n_expts_tot, n_expts_act, n_expt_shards = 1, 1, 1
660+
n_expts_tot, n_expts_act = 1, 1
687661
opt_flags.update_opt_flags_constraints(constraints)
688662

689663
weight_dtype, act_dtype = torch.float16, torch.float16
690664
if mode == "ragged":
691-
m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter,
665+
m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter,
692666
device=device)
693667
else:
694668
rdata = gindx = sindx = None
695669

696-
precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, n_expts_tot // n_expt_shards, device=device)
697-
x, w, bias, _, _ = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode,
670+
precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, n_expts_tot, device=device)
671+
x, w, bias, _, _ = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, mode,
698672
act_dtype, weight_dtype, False, requires_grad=False, device=device)
699673

700674
if mode == "batched":

python/triton_kernels/tests/test_routing.py

Lines changed: 2 additions & 5 deletions
@@ -55,11 +55,8 @@ def _assert_indx_equal(ref, tri):
     tri_expt_data = tri_routing_data.expt_data
     assert_equal(ref_expt_data.hist, tri_expt_data.hist)
     assert_equal(ref_expt_data.token_offs_raw, tri_expt_data.token_offs_raw)
-    assert len(ref_expt_data.token_offs_pad) == len(tri_expt_data.token_offs_pad)
-    assert len(ref_expt_data.block_pid_map) == len(tri_expt_data.block_pid_map)
-    for block_m in ref_expt_data.token_offs_pad.keys():
-        assert_equal(ref_expt_data.token_offs_pad[block_m], tri_expt_data.token_offs_pad[block_m])
-        assert_equal(ref_expt_data.block_pid_map[block_m], tri_expt_data.block_pid_map[block_m])
+    assert_equal(ref_expt_data.token_offs_pad_data, tri_expt_data.token_offs_pad_data)
+    assert_equal(ref_expt_data.block_pid_map_data, tri_expt_data.block_pid_map_data)

     assert ref_routing_data.n_expts_tot == ref_routing_data.n_expts_tot
     assert ref_routing_data.n_expts_act == ref_routing_data.n_expts_act

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 2 additions & 2 deletions
@@ -169,8 +169,8 @@ def make_kernel_args(data, block_m):
     return (
         expt_data.hist,
         expt_data.token_offs_raw,
-        expt_data.token_offs_pad[block],
-        expt_data.block_pid_map[block],
+        expt_data.token_offs_pad(block),
+        expt_data.block_pid_map(block),
     ) + args

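Side note on the last two hunks (matmul_ogs.py together with the test_routing.py change): the call sites switch from token_offs_pad[block] / block_pid_map[block] to token_offs_pad(block) / block_pid_map(block), and the test now compares token_offs_pad_data / block_pid_map_data tensors directly, which suggests the per-block tables became callables backed by a single stacked tensor. A purely illustrative sketch of that pattern with hypothetical names (not the library's actual expt_data implementation):

import torch

class PerBlockTable:
    # Hypothetical container: one stacked tensor with a row per supported block size,
    # exposed through a call that slices out the row for a given block_m.
    def __init__(self, data: torch.Tensor, block_sizes: list):
        self.data = data                        # shape (len(block_sizes), n)
        self._row = {b: i for i, b in enumerate(block_sizes)}

    def __call__(self, block_m: int) -> torch.Tensor:
        return self.data[self._row[block_m]]

# Usage mirroring the new call sites: table(block) instead of table[block],
# while tests can assert on table.data directly.
table = PerBlockTable(torch.arange(8).reshape(2, 4), block_sizes=[16, 32])
row_16 = table(16)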

0 commit comments
