openxla
diff --git a/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 27 deletions b/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 27 deletions
diff --git a/‎bench/tests/test_compact.py‎
Lines changed: 29 additions & 0 deletions b/‎bench/tests/test_compact.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎bench/tests/test_matmul.py‎
Lines changed: 4 additions & 17 deletions b/‎bench/tests/test_matmul.py‎
Lines changed: 4 additions & 17 deletions
diff --git a/‎bench/tests/test_routing.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/tests/test_routing.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/triton_bench/__init__.py‎
Lines changed: 7 additions & 0 deletions b/‎bench/triton_bench/__init__.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎bench/triton_bench/compact.py‎
Lines changed: 87 additions & 0 deletions b/‎bench/triton_bench/compact.py‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎bench/triton_bench/matmul_ogs_details/_common.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/triton_bench/matmul_ogs_details/_common.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/triton_bench/matmul_ogs_details/metadata.py‎
Lines changed: 36 additions & 47 deletions b/‎bench/triton_bench/matmul_ogs_details/metadata.py‎
Lines changed: 36 additions & 47 deletions
@@ -1,13 +1,12 @@
 from pathlib import Path
 import json
-import triton
 import triton.profiler as proton
 import torch
 import triton_bench.swiglu
 from triton_bench.mxfp import downcast_to_mxfp
 from triton_bench.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
 from triton_bench.numerics import InFlexData
-from triton_bench.routing import routing, simulate_expert_sharded_routing
+from triton_bench.routing import routing
 from triton_bench.meta import cuda_capability_geq, is_hip, get_cdna_version
 
 if torch.cuda.is_available() and not is_hip():
@@ -49,7 +48,8 @@ def _query_gpu_specs():
 
 def quantize(w, dtype, dev, **opt):
     if dtype == "bf16":
-        return w.to(torch.bfloat16), InFlexData(), MicroscalingCtx()
+        wq = w.to(torch.bfloat16).transpose(-1, -2).contiguous().transpose(-1, -2)
+        return wq, InFlexData(), MicroscalingCtx()
     elif dtype == "fp8":
         fp8e4_dtype = torch.float8_e4m3fn if get_cdna_version() != 3 \
             else torch.float8_e4m3fnuz
@@ -98,46 +98,35 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     # -- benchmark --
     fpath = Path(f"logs/{name}/{batch}-{dim1}-{dim2}-{n_expts_tot}-{n_expts_act}-{x_dtype}-{w_dtype}.hatchet")
     fpath.parent.mkdir(parents=True, exist_ok=True)
-    proton.start(str(fpath.with_suffix('')), hook="triton")
-    proton.deactivate()
-    # run layer
     x_dtype = {"bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
     # special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
     if x_dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
         x_dtype = torch.float8_e4m3fnuz
+
+    x = torch.randn((batch, dim1), device=dev)
+    xg = x.to(wg.dtype if n_expts_tot > 1 else x_dtype)
+    x = x.to(x_dtype)
+    # run layer
+    proton.start(str(fpath.with_suffix('')), hook="triton")
     for i in range(100):
-        x = torch.randn((batch, dim1), device=dev)
-        x = x.to(wg.dtype if n_expts_tot > 1 else x_dtype)
-        proton.activate()
         if n_expts_tot > 1:
-            logits = matmul_ogs(x, wg, bg, precision_config=pcg)
-            rdata, gather_indx, scatter_indx = routing(logits, n_expts_act)
-            if EP > 1:
-                proton.deactivate()
-                # TODO: activate proton here when fast expert parallelism simulation is done
-                m = logits.shape[0] * EP
-                _, rdata, gather_indx, scatter_indx = simulate_expert_sharded_routing(m, rdata, EP, device=dev)
-                proton.activate()
-            x = x.to(x_dtype)
+            logits = matmul_ogs(xg, wg, bg, precision_config=pcg)
+            rdata, gather_indx, scatter_indx = routing(logits, n_expts_act, simulated_ep=EP)
         else:
             rdata, gather_indx, scatter_indx = None, None, None
-        # c0 = torch.empty((x.shape[0], w1.shape[-1]), device=dev, dtype=x.dtype)
-        # c1 = torch.empty((x.shape[0], w2.shape[-1]), device=dev, dtype=x.dtype)
-        # TODO: cublas is simply set to None on AMD and may cause this to fail if uncommented
-        # cublas.matmul(x, w1.squeeze(0), c0)
-        # cublas.matmul(c0, w2.squeeze(0), c1)
         x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1)
         x = triton_bench.swiglu.swiglu(x, 1.0, pcs)
         x = matmul_ogs(x, w2, b2, rdata, scatter_indx=scatter_indx, precision_config=pc2)
-        proton.deactivate()
     proton.finalize()
 
     # -- analyze --
     with open(f"{fpath}") as fd:
         data = json.load(fd)
         # TODO: this will be broken if kernels use scopes themselves
         # compute useful (a.k.a. matmul) bytes and flops
-        matmuls = [x for x in data[0]["children"] if "matmul" in x["frame"]["name"]]
+        matmuls = [
+            x for x in data[0]["children"] if "_matmul" in x["frame"]["name"] and "metadata" not in x["frame"]["name"]
+        ]
         tot_bytes = sum([x["metrics"]["bytes"] for x in matmuls])
         tot_flops = {w: sum([x["metrics"].get(f"flops{w}", 0) for x in matmuls]) for w in [8, 16]}
         # compute total time (incl. "not useful" work)
@@ -163,5 +152,5 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     qxdtype = "fp8" if has_native_mx4 else "bf16"
     print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
     print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=2, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=2, name="llama4"))
@@ -0,0 +1,29 @@
+import pytest
+import torch
+from triton_bench.compact import masked_compact, masked_compact_torch
+
+
+@pytest.mark.parametrize("n_tokens, n_cols, k, p", [
+    (8192, 64, 4, 0.5),
+    (8192, 64, 4, 1.0),
+    (131, 128, 16, 0.6),
+    (496, 128, 16, 0.),
+])
+def test_masked_compact(n_tokens, n_cols, k, p):
+    """Compare the Triton `masked_compact` against `masked_compact_torch`.
+
+    Each row gets `k` distinct column indices in `[0, n_cols)` with random
+    bf16 values.  Every (row, index) pair is marked active in the mask with
+    probability `p`, so `p=1.0` keeps everything and `p=0.` keeps nothing.
+    """
+    device = "cuda"
+    # argsort of random noise is a random permutation of the columns; its
+    # first `k` entries are therefore `k` distinct column indices per row
+    yi = torch.rand((n_tokens, n_cols), device=device).argsort(dim=-1)
+    yi = yi[:, :k].to(torch.int32)
+    yv = torch.randn((n_tokens, k), dtype=torch.bfloat16, device=device)
+    # keep each index of yi with probability `p` (i.e. drop it with
+    # probability 1 - p): only kept (row, column) pairs are set in `mask`
+    mask = torch.zeros((n_tokens, n_cols), dtype=torch.int32, device=device)
+    keep = (torch.rand(yi.shape, device=device) < p)
+    if keep.any():
+        rows = torch.arange(yi.size(0), device=device).unsqueeze(1).expand_as(yi)
+        mask[rows[keep], yi[keep]] = 1
+    # pack every 32 consecutive columns into one integer word: bit `j` of
+    # word `w` in a row corresponds to column `32 * w + j`
+    # (requires n_cols to be a multiple of 32 for the view below)
+    chunks = mask.view(*mask.shape[:-1], -1, 32)
+    weights = (1 << torch.arange(32, dtype=torch.int32, device=device))
+    bitmask = (chunks.int() * weights).sum(dim=-1)
+    yv_ref, yi_ref = masked_compact_torch(yv, yi, bitmask)
+    yv_tri, yi_tri = masked_compact(yv, yi, bitmask)
+    # exact equality is expected: both implementations only move existing
+    # values around or overwrite them with the sentinel
+    assert torch.all(yi_ref == yi_tri)
+    assert torch.all(yv_ref == yv_tri)
@@ -1,11 +1,9 @@
-import itertools
 from dataclasses import dataclass, fields
 import pytest
 import torch
 # benchmarking utilities
-import triton.profiler as proton
 # routing utilities
-from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
+from triton_bench.routing import routing
 # matmul utilities
 import triton_bench.matmul_ogs_details.opt_flags as opt_flags
 from triton_bench.matmul_ogs import FlexCtx, PrecisionConfig, MicroscalingCtx
@@ -43,22 +41,11 @@ def mask_indx(idx, n_expts_act):
 
 def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter):
+    # Build random routing inputs for the tests in this file.
+    #
+    # Draws random float16 logits of shape (m, n_expts_tot) on the GPU and
+    # runs them through `routing`, passing `n_expt_shards` as `simulated_ep`.
+    # The gate scales are cleared, and the gather / scatter indices are
+    # replaced by None unless `do_gather` / `do_scatter` request them.
+    # Returns (m, routing_data, gather_idx, scatter_idx).
+    #
+    # NOTE(review): the removed code rescaled `m` by `n_expt_shards` before
+    # returning it; `m` is now returned unchanged -- confirm callers agree.
     dev = "cuda"
-    logits = torch.randn((m, n_expts_tot), dtype=torch.float32, device=dev, requires_grad=True)
-    routing_data, gather_idx, scatter_idx = routing_torch(logits, n_expts_act)
-    if n_expt_shards > 1:
-        m = logits.shape[0] * n_expt_shards
-        _, routing_data, gather_idx, scatter_idx = simulate_expert_sharded_routing(m, routing_data, n_expt_shards,
-                                                                                   device=logits.device)
+    logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=dev, requires_grad=True)
+    routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act, simulated_ep=n_expt_shards)
     routing_data.gate_scal = None
     gather_idx = gather_idx if do_gather else None
     scatter_idx = scatter_idx if do_scatter else None
-    if do_gather and do_scatter and n_expts_act == 1 and n_expt_shards == 1:
-        # Compute expt_indx as in routing_torch to access routing_data.expt_hist
-        expt_indx = torch.argsort(-torch.softmax(logits, dim=-1), dim=1,
-                                  stable=True)[:, :n_expts_act].reshape(-1).to(torch.int32)
-        assert (torch.argsort(expt_indx, stable=True) == scatter_idx.dst_indx).all()
-        routing_data.expt_hist[expt_indx[scatter_idx.dst_indx[-n_expts_act:]]] -= 1
-        scatter_idx = mask_indx(scatter_idx, n_expts_act)
     return m, routing_data, gather_idx, scatter_idx
 
 
@@ -315,7 +302,7 @@ def round_x(x, idx):
     scale = lambda val, scal: val if scal is None else val / scal
     if n_expt_shards > 1:
         if not do_scatter:
-            n_rows = rdata.expt_hist[-1].item()
+            n_rows = rdata.expt_hist.sum()
             assert n_rows > 0
             ref_y = ref_y[:n_rows]
             tri_y = tri_y[:n_rows]
 
@@ -73,7 +73,7 @@ def _assert_indx_equal(ref, tri):
 
 def bench_routing():
     import triton.profiler as proton
-    n_tokens = 2048
+    n_tokens = 8192
     block_m = 128
     n_expts_tot, n_expts_act = 128, 4
     tri_logits = init_data(n_tokens, n_expts_tot)
 
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Bitmatrix:
+    # Pairs a tensor with a shape tuple.
+    # `data`: the underlying tensor; the annotation is a string because this
+    # module does not import torch.
+    # NOTE(review): presumably `data` stores packed bits and `shape` is the
+    # logical (unpacked) shape -- confirm against the producers of this type.
+    # NOTE(review): `tuple[int]` denotes a 1-tuple; `tuple[int, ...]` is
+    # presumably what is meant for a matrix shape -- confirm.
+    data: "torch.Tensor"
+    shape: tuple[int]
@@ -0,0 +1,87 @@
+import torch
+import triton
+import triton.language as tl
+from triton_bench import Bitmatrix
+
+
+@triton.jit
+def _masked_compact(Yv, Yi, BitMask, stride_bm, RetYv, RetYi, sentinel, K: tl.constexpr):
+    # Compacts one row per program: entries whose bit is set in BitMask are
+    # moved to the front of the row (order preserved); all other entries are
+    # replaced by `sentinel` and written to the remaining tail slots.
+    #
+    # Rows of Yv / Yi / RetYv / RetYi are addressed as `pid_m * K + column`,
+    # i.e. they are assumed to be densely packed with K entries per row.
+    pid_m = tl.program_id(0)
+    yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
+    yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
+    # index `c` lives in bit `c % 32` of word `c // 32` of this row's mask
+    div = yi // 32
+    rem = yi % 32
+    active_bits = (tl.load(BitMask + pid_m * stride_bm + div) >> rem) & 1
+    # exclusive prefix sum = number of kept entries strictly before each slot,
+    # which is exactly the destination slot of a kept entry
+    exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
+    # a dropped entry at position i goes to
+    #   (#kept before i) + (K - 1 - i) = (#kept in row) + (#dropped after i),
+    # so dropped entries fill the tail and never collide with kept ones
+    rev_arange = tl.where(active_bits, 0, K - 1 - tl.arange(0, K))
+    write_indx = exc_cumsum + rev_arange
+    yv = tl.where(active_bits, yv, sentinel)
+    yi = tl.where(active_bits, yi, sentinel)
+    tl.store(RetYv + pid_m * K + write_indx, yv)
+    tl.store(RetYi + pid_m * K + write_indx, yi)
+
+
+def masked_compact(yv, yi, bitmask, sentinel=-1):
+    """
+    Return compacted copies of *yv* and *yi* based on a per-row bitmask.
+
+    An element is kept when the bit addressed by its index in *yi* is set in
+    the same row of *bitmask*; every other element is replaced by *sentinel*.
+    Kept elements are moved to the front of their row and preserve their
+    original left-to-right order; sentinel entries occupy the rest of the row.
+
+    Parameters
+    ----------
+    yv : torch.Tensor, shape (B, K)
+        Values tensor.
+    yi : torch.Tensor, shape (B, K), integer dtype
+        Column indices associated with *yv*.  Index ``c`` is looked up as bit
+        ``c % 32`` of word ``c // 32`` in the matching row of *bitmask*, so
+        indices are not restricted to the first 32 columns.
+    bitmask : torch.Tensor or Bitmatrix
+        Per-row packed mask with 32 mask bits per word; a `Bitmatrix` is
+        unwrapped to its ``data`` tensor.  Expected to be 2-D with one row
+        per row of *yi* (only ``stride(0)`` is forwarded to the kernel).
+    sentinel : int, default -1
+        Value written into dropped positions of the returned tensors.
+
+    Returns
+    -------
+    (yv_out, yi_out) : Tuple[torch.Tensor, torch.Tensor], each shape (B, K)
+        New tensors with the same dtype/device as the inputs.
+
+    Notes
+    -----
+    One kernel program is launched per row.  The kernel addresses rows as
+    ``row * K``, so *yv* and *yi* are assumed to be contiguous; this is not
+    checked here.  ``K`` is used as a ``tl.arange`` bound, which presumably
+    requires it to be a power of two -- TODO confirm.
+
+    """
+
+    n_rows, n_cols = yi.shape
+    ret_yv = torch.empty_like(yv)
+    ret_yi = torch.empty_like(yi)
+    if isinstance(bitmask, Bitmatrix):
+        bitmask = bitmask.data
+
+    _masked_compact[(n_rows, )](
+        yv, yi, bitmask, bitmask.stride(0),  # inputs
+        ret_yv, ret_yi,  # outputs
+        sentinel,  # sentinel
+        K=n_cols  # constants
+    )
+    return ret_yv, ret_yi
+
+
+def masked_compact_torch(yv: torch.Tensor, yi: torch.Tensor, bitmask: torch.Tensor, sentinel=-1):
+    """
+    Reference (pure PyTorch) implementation of `masked_compact`.
+
+    Expects a 2-D *bitmask* of shape (B, n_words) holding 32 mask bits per
+    word.  Returns new `(yv, yi)` tensors of shape (B, K) in which the kept
+    entries come first in their original order, followed by *sentinel*.
+    """
+    # NOTE(review): B, K and dtype are unpacked but only `device` is used below
+    B, K = yi.shape
+    device, dtype = yi.device, yi.dtype
+    # Expand every word into its 32 bits: (B, n_words, 32) boolean tensor
+    w = (1 << torch.arange(32, device=device, dtype=bitmask.dtype))
+    bits = (bitmask.unsqueeze(-1) & w) != 0
+    # Merge the last two dims so that column c sits at position c: (B, n_words * 32)
+    mask = bits.flatten(start_dim=-2)  # or bits.reshape(B, -1)
+    # For every yi element decide whether it should be kept
+    keep = mask.gather(1, yi.long())
+    # Build a stable permutation that brings all "keep" items forward
+    #    False→0, True→1  ==> invert so kept==0, dropped==1, then argsort
+    order = (~keep).to(torch.int).argsort(dim=1, stable=True)
+    # Re-order tensors according to the above permutation; `gather` returns
+    # new tensors, so the caller's yv / yi are left untouched
+    yi_sorted = yi.gather(1, order)
+    yv_sorted = yv.gather(1, order)
+    # overwrite the dropped entries (now at the end of each row) with sentinel
+    keep_sorted = keep.gather(1, order)
+    yi_sorted[~keep_sorted] = sentinel
+    yv_sorted[~keep_sorted] = sentinel
+    return yv_sorted, yi_sorted
@@ -91,5 +91,5 @@ def matmul_launch_metadata(grid, kernel, args):
     sindx = args.get("WriteBackIndx", None)
     if sindx is not None:
         skipped = (sindx == -1).sum() / sindx.numel()
-    ret["bytes"] = ((1 - skipped) * Y.numel() * Y.element_size() + X.numel() * X.element_size() + n_w_bytes)
+    ret["bytes"] = int((1 - skipped) * Y.numel() * Y.element_size() + X.numel() * X.element_size() + n_w_bytes)
     return ret
@@ -14,41 +14,35 @@ class ExptData:
 
 
 @triton.jit
-def _memset_metadata(Metadata, metadata_size, BLOCK: tl.constexpr):
+def _matmul_metadata_memset(Hist, n_expts_tot, MDHist, MDTokStarts, MDTileStarts, MDTileInfo, md_n_tiles,
+                            BLOCK: tl.constexpr, TILE_DIM: tl.constexpr):
+    # Two jobs in one launch:
+    #   * program 0 copies Hist into MDHist and writes the running sums of
+    #     per-expert token counts (MDTokStarts) and per-expert tile counts
+    #     (MDTileStarts, with tiles = cdiv(tokens, TILE_DIM)); entry 0 of both
+    #     is 0 and entry i + 1 holds the sum over experts 0..i
+    #   * every program fills its BLOCK-sized slice of MDTileInfo with
+    #     0xffffffff, bounded by md_n_tiles
     pid = tl.program_id(0)
+    # only program 0 computes the prefix sums
+    if pid == 0:
+        # running totals carried from one BLOCK-sized chunk of experts to the next
+        x_tok = tl.zeros([BLOCK], dtype=MDTokStarts.dtype.element_ty)
+        x_tile = tl.zeros([BLOCK], dtype=MDTileStarts.dtype.element_ty)
+        tl.store(MDTokStarts, 0)
+        tl.store(MDTileStarts, 0)
+        for i in range(0, n_expts_tot, BLOCK):
+            offs_n = tl.arange(0, BLOCK) + i
+            mask = offs_n < n_expts_tot
+            # NOTE(review): no `other=` is given, so masked-off lanes are
+            # presumably unspecified; they can only appear in the final chunk,
+            # after the valid lanes, and the stores below are masked -- confirm
+            hist_tok = tl.load(Hist + offs_n, mask=mask)
+            hist_tile = tl.cdiv(hist_tok, TILE_DIM)
+            tok_starts = tl.cumsum(hist_tok, 0) + x_tok
+            x_tok += tl.sum(hist_tok, 0).to(MDTokStarts.dtype.element_ty)
+            tile_starts = tl.cumsum(hist_tile, 0) + x_tile
+            x_tile += tl.sum(hist_tile, 0).to(MDTileStarts.dtype.element_ty)
+            tl.store(MDHist + offs_n, hist_tok, mask=mask)
+            tl.store(MDTokStarts + 1 + offs_n, tok_starts, mask=mask)
+            tl.store(MDTileStarts + 1 + offs_n, tile_starts, mask=mask)
+
+    # all programs: mark this program's slice of the tile-info array with 0xffffffff
     offs = pid * BLOCK + tl.arange(0, BLOCK)
-    tl.store(Metadata + offs, 0xffffffff, mask=offs < metadata_size)
+    tl.store(MDTileInfo + offs, 0xffffffff, mask=offs < md_n_tiles)
 
 
 @triton.jit
-def _compute_metadata_1(Hist, n_expts_tot, MDHist, MDTokStarts, MDTileStarts, MDTileInfo, N_EXPTS_PAD: tl.constexpr,
-                        BLOCK: tl.constexpr, TILE_DIM: tl.constexpr):
-
-    BLOCK_N: tl.constexpr = 1024
-
-    x_tok = tl.zeros([BLOCK_N], dtype=MDTokStarts.dtype.element_ty)
-    x_tile = tl.zeros([BLOCK_N], dtype=MDTileStarts.dtype.element_ty)
-
-    tl.store(MDTokStarts, 0)
-    tl.store(MDTileStarts, 0)
-
-    for i in range(0, n_expts_tot, BLOCK_N):
-        offs_n = tl.arange(0, BLOCK_N) + i
-        mask = offs_n < n_expts_tot
-        hist_tok = tl.load(Hist + offs_n, mask=mask)
-        hist_tile = tl.cdiv(hist_tok, TILE_DIM)
-        tok_starts = tl.cumsum(hist_tok, 0) + x_tok
-        x_tok += tl.sum(hist_tok, 0)
-        tile_starts = tl.cumsum(hist_tile, 0) + x_tile
-        x_tile += tl.sum(hist_tile, 0)
-        tl.store(MDHist + offs_n, hist_tok, mask=mask)
-        tl.store(MDTokStarts + 1 + offs_n, tok_starts, mask=mask)
-        tl.store(MDTileStarts + 1 + offs_n, tile_starts, mask=mask)
-
-
-@triton.jit
-def _compute_metadata_2(Hist, n_expts_tot, MDHist, MDTokStarts, MDTileStarts, MDTileInfo, N_EXPTS_PAD: tl.constexpr,
-                        BLOCK: tl.constexpr, TILE_DIM: tl.constexpr):
+def _matmul_metadata_compute(Hist, MDTileStarts, MDTileInfo, BLOCK: tl.constexpr, TILE_DIM: tl.constexpr):
 
     expt_id = tl.program_id(0)
     n_tokens = tl.load(Hist + expt_id)
@@ -75,26 +69,21 @@ def compute_metadata(routing_data, n_rows, block_m):
         grid_m = n_rows
     else:
         grid_m = n_expts_tot - 1 - ((n_expts_tot - n_rows - 1) // block_m)
-    n_expts_pad = cdiv(n_expts_tot, 128) * 128
     metadata_size = 3 * n_expts_tot + 2 + grid_m
     metadata = torch.empty(metadata_size, dtype=torch.int32, device=device)
     md_hist = metadata[:n_expts_tot]
-    md_tok_starts = metadata[n_expts_tot:n_expts_tot * 2 + 1]
+    md_offs = metadata[n_expts_tot:n_expts_tot * 2 + 1]
+    md_offs_sum = metadata[3 * n_expts_tot + 2 - 1]
     md_tile_starts = metadata[n_expts_tot * 2 + 1:n_expts_tot * 3 + 2]
     md_tile_infos = metadata[n_expts_tot * 3 + 2:]
-    _memset_metadata[(cdiv(metadata_size, MEMSET_BLOCK), )](
-        metadata, metadata_size,  # inputs
-        BLOCK=MEMSET_BLOCK  # optimization parameters
+    _matmul_metadata_memset[(cdiv(metadata_size, MEMSET_BLOCK), )](
+        routing_data.expt_hist, n_expts_tot, md_hist, md_offs, md_tile_starts, md_tile_infos, md_tile_infos.shape[0],
+        BLOCK=MEMSET_BLOCK,  # optimization parameters
+        TILE_DIM=block_m,  # constants
+    )
+    _matmul_metadata_compute[(n_expts_tot, )](
+        routing_data.expt_hist, md_tile_starts, md_tile_infos,  # outputs
+        BLOCK=HIST2_BLOCK_M,  # optimization parameters
+        TILE_DIM=block_m,  # constants
     )
-    for kernel, num_blocks in [(_compute_metadata_1, 1), (_compute_metadata_2, n_expts_tot)]:
-        kernel[(num_blocks, )](
-            routing_data.expt_hist, n_expts_tot,  # inputs
-            md_hist, md_tok_starts, md_tile_starts, md_tile_infos,  # outputs
-            BLOCK=HIST2_BLOCK_M,  # optimization parameters
-            N_EXPTS_PAD=n_expts_pad, TILE_DIM=block_m,  # constants
-        )
-    hist = metadata[:n_expts_tot]
-    offs = metadata[n_expts_tot:2 * n_expts_tot + 1]
-    offs_sum = metadata[3 * n_expts_tot + 2 - 1]
-    blocks = metadata[n_expts_tot + 2 * (n_expts_tot + 1):]
-    return ExptData(hist, offs, offs_sum, blocks, metadata)
+    return ExptData(md_hist, md_offs, md_offs_sum, md_tile_infos, metadata)