Commit 5d17897

[BENCH][KERNELS] Extract common code from bench_mlp.py and distributed.py into bench_utils.py (#8866)
Previously, updates to `triton_kernels` only triggered tests in `distributed.py`, leaving `bench_mlp.py` untested and causing mismatches. This change ensures shared data structures and configs appear only once, so updates to `triton_kernels` are validated consistently. Also fixed a few typing issues in `matmul.py` and `specialize.py`.
1 parent a48e358 commit 5d17897

5 files changed: +101 additions, -97 deletions

python/triton_kernels/bench/bench_mlp.py

Lines changed: 7 additions & 34 deletions
@@ -1,17 +1,13 @@
 from itertools import chain
 from pathlib import Path
-from copy import deepcopy
 import triton.profiler as proton
 import torch
 import argparse
-import triton_kernels
 import triton_kernels.roofline as roofline
-import triton_kernels.swiglu
-from triton_kernels.matmul import matmul, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
-from triton_kernels.target_info import get_cdna_version, cuda_capability_geq
+from triton_kernels.matmul import matmul
+from triton_kernels.target_info import get_cdna_version
 import distributed as triton_dist
-from triton_kernels.tensor_details import layout
-from bench_utils import quantize_weight
+from bench_utils import prepare_mlp_numerics, resolve_x_dtype
 import tempfile


@@ -40,35 +36,12 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
     b2 = triton_dist.broadcast(b2, src=ep_indx * TP, groups=groups, group_idx=ep_indx)

     # -- numerics --
-    opt1 = dict()
-    opt2 = dict()
-    if w_dtype == "mx4":
-        # on hopper we only use 8 warps when weight is scaled
-        num_warps = 4 if batch <= 512 and cuda_capability_geq(10, 0) else 8
-        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
-        scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
-            mx_axis=1, num_warps=num_warps)
-        opt1 = {
-            "value_layout": value_layout,
-            "value_layout_opts": value_layout_opts,
-            "scale_layout": scale_layout,
-            "scale_layout_opts": scale_layout_opts,
-        }
-        opt2 = deepcopy(opt1)
-    wg, wg_flex, wg_scale = quantize_weight(wg, "bf16")
-    w1, w1_flex, w1_scale = quantize_weight(w1, w_dtype, **opt1)
-    w2, w2_flex, w2_scale = quantize_weight(w2, w_dtype, **opt2)
-    pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=wg_flex), b_mx_scale=wg_scale)
-    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2),
-                          (1.0, 1.0))
-    pc1 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex), b_mx_scale=w1_scale)
-    pc2 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex), b_mx_scale=w2_scale)
+    numerics = prepare_mlp_numerics(batch, w_dtype, wg, w1, w2)
+    wg, w1, w2 = numerics.wg, numerics.w1, numerics.w2
+    pcg, pc1, pc2, act = numerics.pcg, numerics.pc1, numerics.pc2, numerics.activation

     # -- benchmark --
-    x_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
-    # special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
-    if x_dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
-        x_dtype = torch.float8_e4m3fnuz
+    x_dtype = resolve_x_dtype(x_dtype)

     input_x = torch.randn((batch // DP, dim1), device=dev)
     expt_assignment = triton_dist.create_expt_assignment(EP, n_expts_tot, torch.device(dev))

python/triton_kernels/bench/bench_utils.py

Lines changed: 67 additions & 4 deletions
@@ -1,12 +1,18 @@
+from copy import deepcopy
+from dataclasses import dataclass
+
+import triton_kernels
+import triton_kernels.swiglu
+from triton_kernels.matmul import PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
 from triton_kernels.numerics import InFlexData
 from triton_kernels.numerics_details.mxfp import downcast_to_mxfp
-from triton_kernels.tensor import convert_layout
-from triton_kernels.tensor import wrap_torch_tensor, FP4
-from triton_kernels.target_info import is_cuda, get_cdna_version, cuda_capability_geq
+from triton_kernels.tensor import convert_layout, wrap_torch_tensor, FP4, Tensor
+from triton_kernels.target_info import is_cuda, get_cdna_version, cuda_capability_geq, is_hip
+from triton_kernels.tensor_details import layout
 import torch


-def quantize_weight(w, dtype, **opt):
+def _quantize_weight(w, dtype, **opt):
     if dtype == "bf16":
         wq = w.to(torch.bfloat16).transpose(-1, -2).contiguous().transpose(-1, -2)
         return wq, InFlexData(), None
@@ -23,3 +29,60 @@ def quantize_weight(w, dtype, **opt):
     w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
     w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
     return w, InFlexData(), w_scale
+
+
+@dataclass
+class MlpNumerics:
+    wg: torch.Tensor | Tensor | None
+    w1: torch.Tensor | Tensor | None
+    w2: torch.Tensor | Tensor | None
+    pcg: PrecisionConfig
+    pc1: PrecisionConfig
+    pc2: PrecisionConfig
+    activation: FusedActivation
+
+
+def _make_default_mlp_activation() -> FusedActivation:
+    return FusedActivation(
+        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2),
+        (1.0, 1.0),
+    )
+
+
+def _make_mx4_quantization_opts(batch: int, w_dtype: str) -> dict:
+    if w_dtype != "mx4" or is_hip():
+        return {}
+    num_warps = 4 if batch <= 512 and cuda_capability_geq(10, 0) else 8
+    value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
+    scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps)
+    return {
+        "value_layout": value_layout,
+        "value_layout_opts": value_layout_opts,
+        "scale_layout": scale_layout,
+        "scale_layout_opts": scale_layout_opts,
+    }
+
+
+def prepare_mlp_numerics(batch: int, w_dtype: str, wg, w1, w2) -> MlpNumerics:
+    quantization_opts = _make_mx4_quantization_opts(batch, w_dtype)
+    wg, wg_flex, wg_scale = _quantize_weight(wg, "bf16")
+    w1, w1_flex, w1_scale = _quantize_weight(w1, w_dtype, **deepcopy(quantization_opts))
+    w2, w2_flex, w2_scale = _quantize_weight(w2, w_dtype, **deepcopy(quantization_opts))
+    activation = _make_default_mlp_activation()
+    return MlpNumerics(
+        wg=wg,
+        w1=w1,
+        w2=w2,
+        pcg=PrecisionConfig(flex_ctx=FlexCtx(rhs_data=wg_flex), b_mx_scale=wg_scale),
+        pc1=PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex), b_mx_scale=w1_scale),
+        pc2=PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex), b_mx_scale=w2_scale),
+        activation=activation,
+    )
+
+
+def resolve_x_dtype(x_dtype: str) -> torch.dtype:
+    dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}
+    dtype = dtype_map[x_dtype]
+    if dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
+        return torch.float8_e4m3fnuz
+    return dtype
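
For orientation, here is how a benchmark consumes the two public helpers added above. This is a minimal sketch, not code from the commit: the shapes, `batch`, and `w_dtype` values are placeholders, and it assumes a CUDA device with `triton_kernels` installed.

# Sketch only: placeholder shapes/values; the helpers are the ones defined in bench_utils.py above.
import torch
from bench_utils import prepare_mlp_numerics, resolve_x_dtype

batch, w_dtype = 1024, "bf16"
dev = "cuda"
wg = torch.randn((512, 128), device=dev)    # gate weights (placeholder shape)
w1 = torch.randn((512, 2048), device=dev)   # first MLP weight (placeholder shape)
w2 = torch.randn((1024, 512), device=dev)   # second MLP weight (placeholder shape)

numerics = prepare_mlp_numerics(batch, w_dtype, wg, w1, w2)
wg, w1, w2 = numerics.wg, numerics.w1, numerics.w2
pcg, pc1, pc2, act = numerics.pcg, numerics.pc1, numerics.pc2, numerics.activation

x = torch.randn((batch, 512), device=dev).to(resolve_x_dtype("bf16"))

Returning an MlpNumerics dataclass keeps the quantized weights, precision configs, and fused activation together, so both benchmarks unpack the same fields instead of rebuilding them independently.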

python/triton_kernels/bench/distributed.py

Lines changed: 11 additions & 45 deletions
@@ -3,21 +3,17 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
-from copy import deepcopy
 from dataclasses import dataclass
 from typing import Tuple, Optional

-import triton_kernels
-import triton_kernels.swiglu
 from triton_kernels.reduce import reduce
 from triton_kernels.topk import topk
-from triton_kernels.matmul import matmul, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
+from triton_kernels.matmul import matmul
 from triton_kernels.target_info import get_cdna_version, is_hip, is_cuda, cuda_capability_geq
-from triton_kernels.tensor_details import layout
 from triton_kernels.tensor import RaggedTensorMetadata, make_ragged_tensor_metadata, remap_ragged_tensor_metadata
 from triton_kernels.distributed import make_expt_dict_uniform, make_expt_assignment, convert_dp_to_ep, convert_ep_to_dp, ExptAssignment, symm_mem_pool

-from bench_utils import quantize_weight
+from bench_utils import prepare_mlp_numerics, resolve_x_dtype


 @dataclass
@@ -250,50 +246,20 @@ def distributed_run(rank, world_size, batch, dim1, dim2, n_expts_tot, n_expts_ac
     b1_full = gather_full(rank, world_size, b1, TP, EP, concat_dim_inside=1, concat_dim_outside=0)
     b2_full = gather_ep(rank, world_size, b2, TP, EP)

-    # quantization
-    opt1 = dict()
-    opt2 = dict()
-    if w_dtype == "mx4" and not is_hip():
-        # on hopper we only use 8 warps when weight is scaled
-        num_warps = 4 if batch <= 512 and cuda_capability_geq(10, 0) else 8
-        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
-        scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
-            mx_axis=1, num_warps=num_warps)
-        opt1 = {
-            "value_layout": value_layout,
-            "value_layout_opts": value_layout_opts,
-            "scale_layout": scale_layout,
-            "scale_layout_opts": scale_layout_opts,
-        }
-        opt2 = deepcopy(opt1)
-    wg, wg_flex, wg_scale = quantize_weight(wg, "bf16")
-    w1, w1_flex, w1_scale = quantize_weight(w1, w_dtype, **opt1)
-    w2, w2_flex, w2_scale = quantize_weight(w2, w_dtype, **opt2)
+    wg_unquantized = wg
+    numerics = prepare_mlp_numerics(batch, w_dtype, wg_unquantized, w1, w2)
+    wg, w1, w2 = numerics.wg, numerics.w1, numerics.w2
+    pcg, pc1, pc2, act = numerics.pcg, numerics.pc1, numerics.pc2, numerics.activation
     if rank == 0:
-        w1_full, w1_flex_full, w1_scale_full = quantize_weight(w1_full, w_dtype, **opt1)
-        w2_full, w2_flex_full, w2_scale_full = quantize_weight(w2_full, w_dtype, **opt2)
-    else:
-        w1_full = w2_full = w1_flex_full = w2_flex_full = w1_scale_full = w2_scale_full = None
-
-    # precision configs
-    pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=wg_flex), b_mx_scale=wg_scale)
-    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2),
-                          (1.0, 1.0))
-    pc1 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex), b_mx_scale=w1_scale)
-    pc2 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex), b_mx_scale=w2_scale)
-    if rank == 0:
-        pc1_full = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex_full), b_mx_scale=w1_scale_full)
-        pc2_full = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex_full), b_mx_scale=w2_scale_full)
+        full_numerics = prepare_mlp_numerics(batch, w_dtype, wg_unquantized, w1_full, w2_full)
+        w1_full, w2_full = full_numerics.w1, full_numerics.w2
+        pc1_full, pc2_full = full_numerics.pc1, full_numerics.pc2
     else:
         pc1_full = pc2_full = None

     # inputs
-    dtype_map = {
-        "fp16": torch.float16,
-        "bf16": torch.bfloat16,
-        "fp8": torch.float8_e4m3fnuz if get_cdna_version() == 3 else torch.float8_e4m3fn,
-    }
-    xd = torch.randn((batch // world_size, dim1), device=dev).to(dtype_map[x_dtype])
+    input_dtype = resolve_x_dtype(x_dtype)
+    xd = torch.randn((batch // world_size, dim1), device=dev).to(input_dtype)
     x0 = all_gather(xd, dim=0)
     expt_assignment = create_expt_assignment(EP, n_expts_tot, torch.device(dev))
     symm_mem_pool.initialize_matmul(
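
One detail in the hunk above is easy to miss: `prepare_mlp_numerics` hands back already-quantized weights, so the script stashes the raw gate weight in `wg_unquantized` before the first call; presumably reusing the quantized `wg` for the rank-0 "full" configuration would quantize it a second time. A minimal sketch of the pattern, with placeholder tensors:

# Sketch only: names mirror the diff above, tensors and sizes are placeholders.
import torch
from bench_utils import prepare_mlp_numerics

wg = torch.randn(512, 128)                      # raw gate weight
w1, w2 = torch.randn(512, 2048), torch.randn(1024, 512)
w1_full, w2_full = torch.randn(512, 4096), torch.randn(2048, 512)

wg_unquantized = wg                             # keep the unquantized tensor around
numerics = prepare_mlp_numerics(1024, "bf16", wg_unquantized, w1, w2)
wg = numerics.wg                                # quantized from here on

# the rank-0 "full" configs start again from the raw gate weight
full_numerics = prepare_mlp_numerics(1024, "bf16", wg_unquantized, w1_full, w2_full)
pc1_full, pc2_full = full_numerics.pc1, full_numerics.pc2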

python/triton_kernels/triton_kernels/matmul.py

Lines changed: 12 additions & 11 deletions
@@ -6,6 +6,7 @@
 import triton
 from enum import Enum, auto
 import math
+from typing import Callable
 # utilities
 from triton_kernels import target_info
 from triton_kernels.numerics import InFlexData, OutFlexData
@@ -26,15 +27,15 @@
 @dataclass(frozen=True)
 class FusedActivation:
     specs: FnSpecs = FnSpecs.default()
-    fn_args: tuple[object] = tuple()
+    fn_args: tuple[object, ...] = tuple()


 @dataclass(frozen=True)
 class Epilogue:
     specs: FnSpecs = FnSpecs.default()
-    fn_arg_values_matmul: tuple[object] = tuple()
-    fn_arg_values_finalize: tuple[object] = tuple()
-    effective_itemsize: float = None
+    fn_arg_values_matmul: tuple[object, ...] = tuple()
+    fn_arg_values_finalize: tuple[object, ...] = tuple()
+    effective_itemsize: float | None = None

 class FnName(Enum):
     QUANTIZE_MXFP8 = auto()
@@ -86,16 +87,16 @@ class FlexCtx:

 @dataclass
 class PrecisionConfig:
-    max_num_imprecise_acc: int = None
+    max_num_imprecise_acc: int | None = None
     allow_tf32: bool = True
     flex_ctx: FlexCtx = FlexCtx()
-    acc_scale: int = 1.0
+    acc_scale: float = 1.0
     flexpoint_saturate_inf: bool = False
-    report_quantization_err_fn: callable = None
-    a_mx_scale: Tensor | None = None
-    b_mx_scale: Tensor| None = None
-    c_mx_scale: Tensor | None = None
-    out_dtype: torch.dtype = None
+    report_quantization_err_fn: Callable | None = None
+    a_mx_scale: torch.Tensor | Tensor | None = None
+    b_mx_scale: torch.Tensor | Tensor | None = None
+    c_mx_scale: torch.Tensor | Tensor | None = None
+    out_dtype: torch.dtype | None = None
     enforce_bitwise_invariance: bool = False

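
These annotation changes are the "typing issues" mentioned in the commit message: `tuple[object]` describes a tuple of exactly one element, `tuple[object, ...]` describes a tuple of any length, and a field that defaults to `None` needs `| None` in its annotation. A small self-contained illustration; the class and values here are invented for this example and are not part of `triton_kernels`:

from dataclasses import dataclass


@dataclass(frozen=True)
class ArgsSketch:
    # tuple[float] would only admit 1-tuples such as (1.0,); the variadic
    # form also covers the empty default and longer tuples.
    fn_args: tuple[float, ...] = tuple()
    # a None default requires the annotation to admit None
    acc_scale: float | None = None


ok = ArgsSketch(fn_args=(1.0, 1.0), acc_scale=0.5)  # accepted by a type checker
print(ok.fn_args, ok.acc_scale)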

python/triton_kernels/triton_kernels/specialize.py

Lines changed: 4 additions & 3 deletions
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional
 import inspect
 import re
 import textwrap
@@ -66,9 +67,9 @@ def _empty_fn():
 @dataclass(frozen=True)
 class FnSpecs:
     name: str
-    fn: "triton.runtime.jit.JITFunction"
-    fn_arg_names: tuple[str]
-    fn_arg_do_not_specialize: tuple[str] = tuple()
+    fn: Optional["triton.runtime.jit.JITFunction"]
+    fn_arg_names: tuple[str, ...] = tuple()
+    fn_arg_do_not_specialize: tuple[str, ...] = tuple()
     reduction_n: int = 1

     @staticmethod
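
The `FnSpecs` change makes `fn` explicitly `Optional` and gives the argument-name tuples empty defaults, so a placeholder spec without a JIT function now type-checks. A minimal sketch under that assumption; the class below only mirrors the annotations and is not the real `FnSpecs` (whose `default()` constructor lives elsewhere in specialize.py):

from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class FnSpecsSketch:
    name: str
    fn: Optional[object] = None                       # stands in for "triton.runtime.jit.JITFunction"
    fn_arg_names: tuple[str, ...] = tuple()
    fn_arg_do_not_specialize: tuple[str, ...] = tuple()
    reduction_n: int = 1


empty = FnSpecsSketch(name="dflt")                    # no function, no arg names: valid under the new annotations
swiglu_like = FnSpecsSketch(name="swiglu", fn_arg_names=("alpha", "limit"), reduction_n=2)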
