
Commit 9b13c1c

[BENCH] Bitmatrix refactor (#6883)
This gives another 10% speed improvement to routing
1 parent: e206b17

10 files changed (+132, -86 lines)
Lines changed: 0 additions & 7 deletions
@@ -1,7 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass
-class Bitmatrix:
-    data: "torch.Tensor"  # noqa: F821
-    shape: tuple[int]

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+
+import torch
+
+from .reduction_details.reduce_bitmatrix import clear_sums, sum_bitmatrix_rows
+
+
+@dataclass
+class Bitmatrix:
+    """
+    Represents a boolean matrix in a packed format where each element occupies
+    a single bit of memory.
+
+    We use a Bitmatrix to represent the routing information, where each row
+    corresponds to a token and each column corresponds to an expert.
+
+    S is either None or an all-zero array of size >= n_cols; we pass it along
+    with the actual bitmatrix to avoid having to launch a separate memset
+    kernel when we call Bitmatrix::sum().
+    """
+
+    data: torch.Tensor
+    shape: tuple[int]
+    S: torch.tensor
+
+    def sum(self, partials_block_size):
+        n_rows, n_cols = self.shape
+        dev = self.data.device
+        if self.S is None:
+            self.S = clear_sums(n_cols, dev)
+        out_ret = self.S[:n_cols]
+        self.S = None  # throw error if we try to sum again
+        return sum_bitmatrix_rows(self, out_ret, partials_block_size)
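
For reference, the packed layout described in the docstring maps a (token, expert) pair to one bit of a 32-bit word. The helper below is a hypothetical, pure-Python illustration of that addressing (it mirrors the `div = yi // 32` / `rem = yi % 32` arithmetic used by `_masked_compaction` further down); it is not part of this commit.

```python
import torch

def is_routed(data: torch.Tensor, token: int, expert: int) -> bool:
    # Expert `expert` lives in 32-bit word `expert // 32` of row `token`,
    # at bit position `expert % 32`.
    word = int(data[token, expert // 32].item()) & 0xFFFFFFFF
    return ((word >> (expert % 32)) & 1) == 1
```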

python/triton_kernels/triton_kernels/compaction.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 import torch
 from .compaction_details._masked_compaction import _masked_compaction
-from triton_kernels import Bitmatrix
+from .bitmatrix import Bitmatrix
 
 
 def compaction(yv, yi, bitmask, sentinel=-1):
@@ -36,7 +36,7 @@ def compaction(yv, yi, bitmask, sentinel=-1):
     bitmask = bitmask.data
 
     _masked_compaction[(n_rows, )](
-        yv, yi, bitmask, bitmask.stride(0),  # inputs
+        yv, yi, bitmask, bitmask.stride(0), bitmask.stride(1),  # inputs
         ret_yv, ret_yi,  # outputs
         sentinel,  # sentinel
         K=n_cols  # constants

python/triton_kernels/triton_kernels/compaction_details/_masked_compaction.py

Lines changed: 2 additions & 2 deletions
@@ -3,13 +3,13 @@
 
 
 @triton.jit
-def _masked_compaction(Yv, Yi, BitMask, stride_bm, RetYv, RetYi, sentinel, K: tl.constexpr):
+def _masked_compaction(Yv, Yi, BitMask, stride_bm, stride_bn, RetYv, RetYi, sentinel, K: tl.constexpr):
     pid_m = tl.program_id(0)
     yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
     yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
     div = yi // 32
     rem = yi % 32
-    active_bits = (tl.load(BitMask + pid_m * stride_bm + div) >> rem) & 1
+    active_bits = (tl.load(BitMask + pid_m * stride_bm + div * stride_bn) >> rem) & 1
     exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
     rev_arange = tl.where(active_bits, 0, K - 1 - tl.arange(0, K))
     write_indx = exc_cumsum + rev_arange

python/triton_kernels/triton_kernels/reduction.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

python/triton_kernels/triton_kernels/reduction_details/reduce_bitmatrix.py

Lines changed: 48 additions & 31 deletions
@@ -1,3 +1,4 @@
+import torch
 import triton
 import triton.language as tl
 
@@ -42,49 +43,65 @@ def vpopc(x):
 
 
 @triton.jit
-def _sum_bitmatrix_memset(Ret, ret_size, BLOCK: tl.constexpr):
+def _sum_bitmatrix_memset(Ret, BLOCK: tl.constexpr):
     pid = tl.program_id(0)
     offs = pid * BLOCK + tl.arange(0, BLOCK)
-    tl.store(Ret + offs, 0, mask=offs < ret_size)
+    tl.store(Ret + offs, 0)
 
 
 @triton.jit
-def _sum_bitmatrix_rows(B, shape_bm, stride_bm,  # input bitmatrix
-                        Ret, Partials, stride_pm, shape_pn,  # outputs
-                        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
-    tl.static_assert(BLOCK_N % 32 == 0)
+def _sum_bitmatrix_rows(B, shape_bm, stride_bm: tl.constexpr, stride_bn: tl.constexpr,  # input bitmatrix
+                        Ret, Partials, stride_pm: tl.constexpr, stride_pn, shape_pn,  # outputs
+                        BLOCK_MM: tl.constexpr, BLOCK_M: tl.constexpr):
+
+    tl.static_assert(BLOCK_MM % BLOCK_M == 0)
+    TILE_SIZE: tl.constexpr = BLOCK_MM // BLOCK_M
     pid_m = tl.program_id(0)
     pid_n = tl.program_id(1)
-    BLOCK_B: tl.constexpr = BLOCK_N // 32
-    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-    offs_b = pid_n * BLOCK_B + tl.arange(0, BLOCK_B)
-    bits = tl.load(B + offs_m[None, :] * stride_bm + offs_b[:, None], mask=offs_m[None, :] < shape_bm)
-    ret = tl.reshape(vpopc(bits), [BLOCK_N])
-    mask = offs_n < shape_pn
-    tl.atomic_add(Ret + offs_n, ret, mask=mask, sem="relaxed")
-    tl.store(Partials + pid_m * stride_pm + offs_n, ret, mask=mask)
-
-
-def sum_bitmatrix_rows(x, out_ret, out_partials, partials_block_size=None):
+    offs_m = pid_m * BLOCK_MM + tl.arange(0, BLOCK_MM)
+    offs_n = pid_n * 32 + tl.arange(0, 32)
+    bits = tl.load(B + pid_n * stride_bn + offs_m * stride_bm, mask=offs_m < shape_bm, other=0)
+    bits = tl.reshape(bits, [TILE_SIZE, BLOCK_M])
+    ret = vpopc(bits)  # [TILE_SIZE, 32]
+
+    offs_t = pid_m * TILE_SIZE + tl.arange(0, TILE_SIZE)
+
+    tl.atomic_add(Ret + offs_n, tl.sum(ret, 0), sem="relaxed")
+    tl.store(Partials + offs_t[:, None] * stride_pm + offs_n[None, :] * stride_pn, ret)
+
+
+def clear_sums(n_cols, device, MEMSET_BLOCK=512):
+    cdiv = triton.cdiv
+    blocks = cdiv(n_cols, MEMSET_BLOCK)
+    out_ret = torch.empty((blocks * MEMSET_BLOCK, ), device=device, dtype=torch.int32)
+    _sum_bitmatrix_memset[(blocks, )](out_ret, MEMSET_BLOCK)
+    return out_ret
+
+
+def sum_bitmatrix_rows(x, out_ret, partials_block_size=None):
     assert partials_block_size is not None
     cdiv = triton.cdiv
     PARTIALS_BLOCK_M = partials_block_size
-    BLOCK_N = 32
-    MEMSET_BLOCK = 512
     n_rows, n_cols = x.shape
     assert out_ret.shape == (n_cols, )
-    assert out_partials.shape == (cdiv(n_rows, PARTIALS_BLOCK_M), n_cols)
+
+    TILE_SIZE = 2
+    BLOCK_MM = PARTIALS_BLOCK_M * TILE_SIZE
+
+    pids_x = cdiv(n_rows, BLOCK_MM)
+    pids_y = cdiv(n_cols, 32)
+    out_partials = torch.empty((pids_y * 32, pids_x * TILE_SIZE), device=out_ret.device, dtype=torch.int32)
+    out_partials = torch.transpose(out_partials, 0, 1)
+
     # output tensors
-    _sum_bitmatrix_memset[(cdiv(out_ret.shape[0], MEMSET_BLOCK), )](
-        out_ret, out_ret.shape[0],  # outputs
-        BLOCK=512  # tunable parameter
-    )
-    _sum_bitmatrix_rows[(cdiv(n_rows, PARTIALS_BLOCK_M), cdiv(n_cols, BLOCK_N))](
-        x.data, x.data.shape[0], x.data.stride(0),  # input
+    _sum_bitmatrix_rows[(pids_x, pids_y)](
+        x.data, x.data.shape[0], x.data.stride(0), x.data.stride(1),  # input
        out_ret,  # output [final reduction]
-        out_partials, out_partials.stride(0), out_partials.shape[1],  # output [partial reductions]
-        BLOCK_N=BLOCK_N,  # tunable parameters
-        BLOCK_M=PARTIALS_BLOCK_M,  # constants
-    )
+        out_partials, out_partials.stride(0), out_partials.stride(1),
+        out_partials.shape[1],  # output [partial reductions]
+        BLOCK_M=PARTIALS_BLOCK_M, BLOCK_MM=BLOCK_MM,  # constants
+        num_warps=8)
+
+    out_partials = out_partials[:cdiv(n_rows, PARTIALS_BLOCK_M), :n_cols]
+
     return out_ret, out_partials
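
As a sanity reference for the refactored reduction above: `out_ret[e]` should hold the number of rows with bit `e` set (the per-expert token count), and `out_partials[b, e]` the same count restricted to the `b`-th block of `PARTIALS_BLOCK_M` rows. A slow, pure-Python sketch of that contract (a hypothetical helper, not part of the commit):

```python
def reference_sum(packed_rows, n_cols, block_m):
    # packed_rows: one list of 32-bit words (Python ints) per bitmatrix row
    n_rows = len(packed_rows)
    n_blocks = (n_rows + block_m - 1) // block_m
    hist = [0] * n_cols
    partials = [[0] * n_cols for _ in range(n_blocks)]
    for r, row in enumerate(packed_rows):
        for e in range(n_cols):
            bit = (row[e // 32] >> (e % 32)) & 1
            hist[e] += bit
            partials[r // block_m][e] += bit
    return hist, partials
```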

python/triton_kernels/triton_kernels/routing.py

Lines changed: 6 additions & 5 deletions
@@ -55,7 +55,6 @@ def n_blocks(self, n_rows, block_m):
 
 def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     from .topk import topk
-    from .reduction import sum
     from .compaction import compaction
     assert expt_indx is None
     cdiv = triton.cdiv
@@ -72,6 +71,7 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
         _routing_clear_bitmatrix[(n_tokens, )](
             bitmatrix.data,
             bitmatrix.data.stride(0),
+            bitmatrix.data.stride(1),
             bitmatrix.data.shape[1],
             n_expts_tot // simulated_ep,
             BLOCK_N=512,
@@ -80,10 +80,9 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
         n_expts_tot = n_expts_tot // simulated_ep
         bitmatrix.shape[-1] = n_expts_tot
     # perform compaction to update expt_scal / expt_indx
-    hist, partial_hist = sum(bitmatrix, partials_block_size=HIST_BLOCK_M, dim=0)
+    hist, partial_hist = bitmatrix.sum(partials_block_size=HIST_BLOCK_M)
     # scratchpad
     expt_offs = torch.empty(n_expts_tot, dtype=torch.int32, device=device)
-    indx_offs = torch.empty((cdiv(n_tokens, HIST_BLOCK_M), n_expts_tot), dtype=torch.int32, device=device)
     combined_indx = torch.empty(n_gates * 2, dtype=torch.int32, device=device)
     # output
     topk_indx = combined_indx[:n_gates]
@@ -93,12 +92,14 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
         expt_offs, hist.shape[0], BLOCK_N=512)
     _routing_compute_indx_offs[(n_expts_tot, )](
         expt_offs, partial_hist,  # inputs
-        indx_offs, partial_hist.shape[0], partial_hist.stride(0),  # outputs
+        partial_hist.shape[0], partial_hist.stride(0), partial_hist.stride(1),  # outputs
         BLOCK_M=INDX_OFFS_BLOCK_M,  # tunable parameters
     )
+    indx_offs = partial_hist
+
     _routing_compute_indx[(cdiv(n_tokens, HIST_BLOCK_M), )](
         topk_indx, gate_indx, gate_scal,  # outputs
-        expt_scal, expt_indx, indx_offs, indx_offs.stride(0), n_gates,  # input
+        expt_scal, expt_indx, indx_offs, indx_offs.stride(0), indx_offs.stride(1), n_gates,  # input
         BLOCK_M=HIST_BLOCK_M,  # tunable parameters
         N_EXPTS_ACT=n_expts_act,  # constants
         num_warps=1 if HIST_BLOCK_M * n_expts_act // 32 < 4 else 4)
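
The scratchpad saving above comes from `_routing_compute_indx_offs` (next file) now writing its result back into `partial_hist`, so the separate `indx_offs` allocation disappears. Conceptually, each expert column is replaced by its exclusive cumulative sum over row blocks, shifted by that expert's global start offset. A PyTorch sketch of that transformation, illustrative only and not the kernel itself:

```python
import torch

def indx_offs_reference(partial_hist: torch.Tensor, expt_offs: torch.Tensor) -> torch.Tensor:
    # partial_hist: (n_blocks, n_experts) per-block token counts
    # expt_offs:    (n_experts,) starting position of each expert in the sorted order
    inclusive = torch.cumsum(partial_hist, dim=0)
    exclusive = inclusive - partial_hist
    return exclusive + expt_offs[None, :]
```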

python/triton_kernels/triton_kernels/routing_details/_routing_compute.py

Lines changed: 9 additions & 11 deletions
@@ -18,21 +18,19 @@ def _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size,  # histog
 
 
 @triton.jit
-def _routing_compute_indx_offs(TokensStart, PartialHist, PartialOffs, shape_pm, stride_pm, BLOCK_M: tl.constexpr):
+def _routing_compute_indx_offs(TokensStart, PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M: tl.constexpr):
     expt_id = tl.program_id(0)
     offs_m = tl.arange(0, BLOCK_M)
     # initialize first row of the output
     start = tl.load(TokensStart + expt_id)
-    tl.store(PartialOffs + expt_id, start)
     # iterate over input data
     curr_sum = start
     for _ in range(0, shape_pm, BLOCK_M):
-        offs = offs_m * stride_pm + expt_id
+        offs = offs_m * stride_pm + expt_id * stride_pn
         curr = tl.load(PartialHist + offs, mask=offs_m < shape_pm)
         out = tl.cumsum(curr, 0) + curr_sum
         curr_sum += tl.sum(curr, 0)
-        offs = (1 + offs_m) * stride_pm + expt_id
-        tl.store(PartialOffs + offs, out, mask=offs_m < shape_pm - 1)
+        tl.store(PartialHist + offs, out - curr, mask=offs_m < shape_pm)
         offs_m += BLOCK_M
@@ -49,8 +47,8 @@ def _keyed_add(x, y):
 
 
 @triton.jit
-def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, n_gates,
-                          BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr):
+def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, stride_pn,
+                          n_gates, BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr):
 
     pid_m = tl.program_id(0)
 
@@ -73,7 +71,7 @@ def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx,
     expts_and_inclusive_run_lengths = tl.associative_scan(x, 0, _keyed_add)
     exclusive_run_lengths = (expts_and_inclusive_run_lengths - 1) & 0xffff
 
-    gates = tl.load(PartialOffs + pid_m * stride_pm + expert, mask=(expert != 0xffff))
+    gates = tl.load(PartialOffs + pid_m * stride_pm + expert * stride_pn, mask=(expert != 0xffff))
     gates += exclusive_run_lengths
 
     tl.store(ScatterIndx + offs, gates, mask=mask)
@@ -82,17 +80,17 @@ def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx,
 
 
 @triton.jit
-def _routing_clear_bitmatrix(Bitmatrix, stride_bm, shape_bn, cutoff, BLOCK_N: tl.constexpr):
+def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr):
     pid_m = tl.program_id(0)
     cutoff_word = cutoff // 32
     cutoff_bit = cutoff % 32
     cutoff_mask = (1 << (cutoff_bit)) - 1
     for start_n in range(0, shape_bn, BLOCK_N):
         offs_n = start_n + tl.arange(0, BLOCK_N)
-        values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n, mask=offs_n < shape_bn)
+        values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, mask=offs_n < shape_bn)
         values = tl.where(offs_n == cutoff_word, values & cutoff_mask, values)
         values = tl.where(offs_n > cutoff_word, 0, values)
-        tl.store(Bitmatrix + pid_m * stride_bm + offs_n, values, mask=offs_n < shape_bn)
+        tl.store(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, values, mask=offs_n < shape_bn)
 
 
 @triton.jit

Lines changed: 18 additions & 7 deletions
@@ -1,12 +1,13 @@
 import torch
 from .topk_details._topk import _topk
-from triton_kernels import Bitmatrix
+from .bitmatrix import Bitmatrix
 
 
 def topk(x, k, dim=1, return_bitmatrix=True):
     cdiv = lambda a, b: (a + b - 1) // b
-    BLOCK_M = 8
-    BLOCK_N = 128
+    BLOCK_M = 32
+    BLOCK_N = 32
+    BLOCK_S = 128
     assert x.ndim == 2
     assert x.shape[-1] < 32768
     assert dim == 1
@@ -19,13 +20,23 @@ def topk(x, k, dim=1, return_bitmatrix=True):
     # NOTE: these are not returned
     y_vals = torch.empty((n_rows, k), dtype=x.dtype, device=dev)
     y_indx = torch.empty((n_rows, k), dtype=torch.int16, device=dev)
-    bitmatrix = torch.empty((n_rows, n_cols_words), dtype=torch.uint32, device=dev)
-    _topk[(cdiv(n_rows, BLOCK_M), )](
+
+    # create bitmatrix in transposed memory layout:
+    bitmatrix = torch.empty((n_cols_words, cdiv(n_rows, 32) * 32), dtype=torch.uint32, device=dev)
+    bitmatrix = torch.transpose(bitmatrix, 0, 1)[:n_rows]
+    s_blocks = cdiv(n_cols, BLOCK_S)
+    s_cols = s_blocks * BLOCK_S
+    S = torch.empty((s_cols, ), dtype=torch.int32, device=dev)
+
+    pids = max(cdiv(n_rows, BLOCK_M), s_blocks)
+
+    _topk[(pids, )](
         x, x.stride(0),  # inputs
         y_vals, y_indx, y_vals.stride(0),  # output [topk]
-        bitmatrix, bitmatrix.stride(0),  # output [bitmatrix]
+        bitmatrix, bitmatrix.stride(0), bitmatrix.stride(1),  # output [bitmatrix]
         n_rows, n_cols,  # shapes
+        S, BLOCK_S, s_blocks,  # thing to memset to zero
         BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,  # tunable parameter
         N_EXPTS_PAD=n_cols_pad, N_EXPTS_ACT=k,  # constants
     )
-    return y_vals, y_indx, Bitmatrix(bitmatrix, [n_rows, n_cols])
+    return y_vals, y_indx, Bitmatrix(bitmatrix, [n_rows, n_cols], S)
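
A quick illustration of the transposed allocation above (made-up sizes, int32 as a stand-in for uint32): the buffer is allocated word-column-major and then viewed as (n_rows, n_cols_words), so consecutive tokens of the same 32-expert word sit next to each other in memory, which is the access pattern `_sum_bitmatrix_rows` now reads.

```python
import torch

n_rows, n_cols_words = 1000, 4
padded_rows = (n_rows + 31) // 32 * 32
buf = torch.empty((n_cols_words, padded_rows), dtype=torch.int32)
bitmatrix = torch.transpose(buf, 0, 1)[:n_rows]  # logical shape: (n_rows, n_cols_words)
print(bitmatrix.stride())  # (1, padded_rows): the row dimension is the contiguous one
```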

python/triton_kernels/triton_kernels/topk_details/_topk.py

Lines changed: 14 additions & 5 deletions
@@ -40,9 +40,18 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
 @triton.jit
 def _topk(X, stride_xm,  # inputs
           Yv, Yi, stride_ym,  # topk values/indices
-          Bits, stride_rm, n_rows,  # bitmatrix
-          n_expts_tot, BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
-          BLOCK_N: tl.constexpr):
+          Bits, stride_rm: tl.constexpr, stride_rn: tl.constexpr, n_rows,  # bitmatrix
+          n_expts_tot, S, BLOCK_S: tl.constexpr, s_blocks,  # thing to memset
+          BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr):
+
+    pid = tl.program_id(0)
+
+    if pid < s_blocks:
+        tl.store(S + BLOCK_S * pid + tl.arange(0, BLOCK_S), tl.zeros([BLOCK_S], tl.int32))
+
+    if pid * BLOCK_M >= n_rows:
+        # early exit:
+        return
 
     tl.static_assert(BLOCK_N % 32 == 0)
     tl.static_assert(N_EXPTS_PAD % BLOCK_N == 0)
@@ -52,7 +61,7 @@ def _topk(X, stride_xm,  # inputs
     x_ultype: tl.constexpr = tl.dtype(f"uint{2*x_nbits}")
 
     # load logits
-    offs_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_m = pid * BLOCK_M + tl.arange(0, BLOCK_M)
     mask_m = offs_m[:, None] < n_rows
     y = streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD, N_EXPTS_ACT, BLOCK_N)
     y = y.to(x_ultype, bitcast=True)
@@ -79,5 +88,5 @@ def _topk(X, stride_xm,  # inputs
         offs_r_n = tl.arange(0, BLOCK_N // 32) + i * (BLOCK_N // 32)
         y2 = tl.where(y_div[:, :, None] == offs_r_n[None, None, :], (1 << y_rem)[:, :, None], 0)
         r = tl.reduce_or(y2, axis=1)
-        BitsPtrs = Bits + offs_m[:, None] * stride_rm + offs_r_n[None, :]
+        BitsPtrs = Bits + offs_m[:, None] * stride_rm + offs_r_n[None, :] * stride_rn
         tl.store(BitsPtrs, r, mask=mask_m)
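
The fused memset above removes a standalone kernel launch: programs `0..s_blocks-1` zero their `BLOCK_S`-wide slice of `S`, and any program whose row block falls past `n_rows` exits right after. The grid therefore has to cover whichever job needs more programs; a small sketch of that sizing rule (it mirrors the `pids = max(...)` line in `topk`, the helper name is made up):

```python
def topk_grid(n_rows: int, n_cols: int, BLOCK_M: int = 32, BLOCK_S: int = 128) -> int:
    cdiv = lambda a, b: (a + b - 1) // b
    s_blocks = cdiv(n_cols, BLOCK_S)    # programs that also clear S
    row_blocks = cdiv(n_rows, BLOCK_M)  # programs that do top-k work
    return max(row_blocks, s_blocks)
```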
