
Commit 981b0bb

[KERNELS] improved twiddling/swizzling for H100 simulated mxfp4 (h/t @lezcano) (#7587)
1 parent 63f3432 commit 981b0bb

12 files changed: +296 additions, −148 deletions


python/triton_kernels/bench/bench_mlp.py

Lines changed: 11 additions & 14 deletions
@@ -10,11 +10,11 @@
 from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
 from triton_kernels.numerics import InFlexData
 from triton_kernels.routing import routing
-from triton_kernels.target_info import is_hip, get_cdna_version, is_cuda
+from triton_kernels.target_info import is_hip, get_cdna_version
 from triton_kernels.tensor import convert_layout
-from triton_kernels.tensor_details.layout import StridedLayout, BlackwellMXScaleLayout, HopperMXScaleLayout, HopperMXValueLayout
 from triton_kernels.tensor import wrap_torch_tensor, FP4
 from dataclasses import dataclass
+from triton_kernels.tensor_details import layout
 
 if torch.cuda.is_available() and not is_hip():
     from triton._C.libtriton import nvidia
@@ -36,8 +36,8 @@ def quantize(w, dtype, dev, **opt):
     else:
         assert dtype == "mx4", f"{dtype=}"
         w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
-        w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"])
-        w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"])
+        w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
+        w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
         return w, InFlexData(), w_scale
 
 
@@ -101,16 +101,13 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
     optg = dict()
     opt1 = dict()
     opt2 = dict()
-    if w_dtype == "mx4":
-        value_layout = StridedLayout
-        scale_layout = StridedLayout
-        if is_cuda():
-            if torch.cuda.get_device_capability()[0] == 9:
-                value_layout = HopperMXValueLayout
-                scale_layout = HopperMXScaleLayout
-            if torch.cuda.get_device_capability()[0] == 10:
-                scale_layout = BlackwellMXScaleLayout
-        opt1 = {"value_layout": value_layout, "scale_layout": scale_layout}
+    if w_dtype == "mx4" and not is_hip():
+        num_warps = 4 if batch <= 512 else 8
+        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
+        scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
+            mx_axis=1, num_warps=num_warps)
+        opt1 = {"value_layout": value_layout, "value_layout_opts": value_layout_opts, \
+                "scale_layout": scale_layout, "scale_layout_opts": scale_layout_opts}
     opt2 = deepcopy(opt1)
     wg, wg_flex, wg_scale = quantize(wg, "bf16", dev, **optg)
     w1, w1_flex, w1_scale = quantize(w1, w_dtype, dev, **opt1)
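Taken together, the reworked call sites follow one pattern: downcast the weights to mxfp4, wrap the raw value and scale tensors, then let a per-architecture factory pick the layout class and its constructor options. Below is a minimal sketch of that flow using only APIs touched in this commit; the helper name to_swizzled_mxfp4 and the w_q / w_scale_q inputs are illustrative (assumed to be the uint8 outputs of downcast_to_mxfp, whose import path is not shown in this diff).

import torch
from triton_kernels.tensor import convert_layout, wrap_torch_tensor, FP4
from triton_kernels.tensor_details import layout

def to_swizzled_mxfp4(w_q: torch.Tensor, w_scale_q: torch.Tensor, mx_axis: int = 1, num_warps: int = 8):
    # Pick the default (layout class, constructor kwargs) pair for this GPU:
    # Blackwell keeps its native scale swizzle, Hopper gets the simulated-mxfp4
    # value/scale swizzles, everything else falls back to StridedLayout.
    value_layout, value_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=mx_axis)
    scale_layout, scale_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
        mx_axis=mx_axis, num_warps=num_warps)
    # convert_layout now forwards the kwargs to the layout constructor (see tensor.py below).
    w = convert_layout(wrap_torch_tensor(w_q, dtype=FP4), value_layout, **value_opts)
    w_scale = convert_layout(wrap_torch_tensor(w_scale_q), scale_layout, **scale_opts)
    return w, w_scale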

python/triton_kernels/tests/test_matmul.py

Lines changed: 16 additions & 17 deletions
@@ -328,25 +328,24 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     w_ref = w_ref.squeeze(0).detach().requires_grad_(test_bwd)
 
     if is_mixed_input:
-        capability_major = torch.cuda.get_device_capability()[0]
-        w_layout = layout.StridedLayout
-        w_scale_layout = layout.StridedLayout
+        mx_axis = w_tri.ndim - 2
+        # compute layouts
+        w_layout, w_layout_opts = layout.StridedLayout, dict()
+        w_scale_layout, w_scale_layout_opts = layout.StridedLayout, dict()
         if hbm_swizzling and "float4" in weight_dtype_str:
-            # weight layout
-            w_layouts = {9: layout.HopperMXValueLayout}
-            w_layout = w_layouts.get(capability_major, layout.StridedLayout)
-            # weight scale layout
-            w_scales_layouts = {9: layout.HopperMXScaleLayout, 10: layout.BlackwellMXScaleLayout}
-            w_scale_layout = w_scales_layouts.get(capability_major, layout.StridedLayout)
-        w_tri, mx_scales_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=-2)
-        w_ref = upcast_from_mxfp(w_tri, mx_scales_tri, torch.bfloat16, axis=-2)
+            w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=mx_axis)
+            w_scale_layout, w_scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
+                mx_axis=mx_axis, num_warps=8)
+        # downcast to mxfp
+        w_tri, w_scale_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=mx_axis)
+        w_ref = upcast_from_mxfp(w_tri, w_scale_tri, torch.bfloat16, axis=mx_axis)
         w_tri_dtype = FP4 if "float4" in weight_dtype_str else weight_dtype
-        w_tri = convert_layout(wrap_torch_tensor(w_tri, w_tri_dtype), w_layout)
-        mx_scales_tri = convert_layout(wrap_torch_tensor(mx_scales_tri), w_scale_layout)
-        precision_opt.weight_scale = mx_scales_tri
-
-        # if not is_persistent and precision_opt.weight_scale is not None:
-        #     pytest.skip("non-persistent not supported with mxfp")
+        w_tri = wrap_torch_tensor(w_tri, w_tri_dtype)
+        w_scale_tri = wrap_torch_tensor(w_scale_tri)
+        # convert layouts
+        w_tri = convert_layout(w_tri, w_layout, **w_layout_opts)
+        w_scale_tri = convert_layout(w_scale_tri, w_scale_layout, **w_scale_layout_opts)
+        precision_opt.weight_scale = w_scale_tri
 
     if test_launch_metadata:

Lines changed: 1 addition & 60 deletions
@@ -1,60 +1 @@
-import torch
-import pytest
-import math
-from triton_kernels.testing import assert_equal
-from triton_kernels.tensor_details.layout import BlackwellMXScaleLayout, HopperMXScaleLayout, HopperMXValueLayout
-
-
-@pytest.mark.parametrize(
-    "shape",
-    [
-        (3, 4096, 1024),
-        (10, 254, 60),
-        (1, 320, 160),
-        (2, 16, 512),
-        (3, 2, 36),
-    ],
-)
-def test_mxfp_swizzle(shape: tuple[int, ...]):
-    """
-    Test that unswizzle is the inverse of swizzle, after removing padding.
-    """
-    x = torch.randn(shape, device="cuda")
-    layout = BlackwellMXScaleLayout(shape)
-    assert_equal(x, layout.unswizzle_data(layout.swizzle_data(x)))
-
-
-@pytest.mark.parametrize("shape", [(16, 32), (16, 64), (32, 32), (32, 64), (64, 128), (128, 128)])
-@pytest.mark.parametrize("trans", [False, True])
-@pytest.mark.parametrize("op_idx", [0, 1])
-@pytest.mark.parametrize("mma_version", [2, 3])
-def test_swizzle_mxfp4_value(shape, trans, op_idx, mma_version):
-    x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    if trans:
-        x = x.mT
-    k_dim = 1 - op_idx
-    if x.shape[k_dim] < 32:
-        pytest.skip("Not enough elements along K")
-    layout = HopperMXValueLayout(x.shape, op_idx, mma_version)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
-    assert (res == x).all()
-
-
-@pytest.mark.parametrize("num_warps", [4, 8])
-@pytest.mark.parametrize("shape", [(256, 64), (256, 128), (256, 256)])
-def test_swizzle_mxfp4_scale(shape, num_warps):
-    x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    layout = HopperMXScaleLayout(x.shape, num_warps=num_warps)
-    res = layout.unswizzle_data(layout.swizzle_data(x))
-    assert (res[:shape[0], :shape[1]] == x).all()
-
-
-def test_unswizzle_mxfp4_value_golden_value():
-    shape = (16, 32)
-    x = torch.arange(math.prod(shape)).view(shape).to(torch.uint8)
-    layout = HopperMXValueLayout(x.shape, op_idx=1, mma_version=3)
-    res = layout.swizzle_data(x)
-    # Thread 0
-    assert res[0, 0:16].tolist() == [0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28]
-    # Thread 1
-    assert res[0, 16:32].tolist() == [1, 1, 5, 5, 9, 9, 13, 13, 17, 17, 21, 21, 25, 25, 29, 29]
+# TODO: add tests for non-layout parts of tensor class
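The deleted file exercised the swizzle/unswizzle round-trip for the Blackwell and Hopper layouts. An equivalent spot check can still be written directly against the reworked layout classes; the sketch below is an assumption-heavy sketch, not part of the commit: it assumes the reworked HopperMXScaleLayout constructor accepts the mx_axis keyword that the new factory helpers pass through convert_layout, and that mx_axis=1 is valid for a 2D scale tensor.

import torch
from triton_kernels.tensor_details.layout import HopperMXScaleLayout

def check_scale_roundtrip(shape=(256, 64), num_warps=8):
    # unswizzle(swizzle(x)) should recover x once any padding is cropped away
    x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
    lyt = HopperMXScaleLayout(x.shape, mx_axis=1, num_warps=num_warps)  # mx_axis kwarg assumed
    res = lyt.unswizzle_data(lyt.swizzle_data(x))
    assert (res[:shape[0], :shape[1]] == x).all()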

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 4 additions & 2 deletions
@@ -482,6 +482,8 @@ def matmul_ogs(x, w, bias,
     w_scale_strides = w_scale.stride() if has_mx and not w_scale_has_tma else (None, None, None)
     if len(w_scale_strides) == 2:
         w_scale_strides = (0, ) + w_scale_strides
+    # if routing_data.expt_hist is not None:
+    #     print(opt_flags)
     # launch kernel
     kernels = get_kernels(epilogue.specs, fused_activation.specs)
     (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(grid,)](
@@ -532,8 +534,8 @@
         **opt_flags.target_kernel_kwargs)
     # post-processing
     out = apply_postprocessing_features(scatter_indx, finalize_scatter_idxs, opt_flags, expt_token_offs_raw,
-                                         num_indx, precision_config, routing_data,
-                                         postprocessing_features, memory, fused_postprocess_activation, epilogue)
+                                        num_indx, precision_config, routing_data,
+                                        postprocessing_features, memory, fused_postprocess_activation, epilogue)
     # remove split-k
     out = out.squeeze(0)
     if not is_input_batched:

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def convert_dtype(dtype):
         # suffix = "" if not mode else "_o" + (''.join(mode))
         # if base_name.startswith("_p"):
         #     suffix += "_ptma"
-        return f"{base_name}_{layouts}_{dtypes}_{blocks}"
+        return f"cutlass_{base_name}_{layouts}_{dtypes}_{blocks}"
 
     return matmul_repr
 

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 14 additions & 8 deletions
@@ -2,7 +2,7 @@
 import triton.language as tl
 from triton_kernels.tensor_details.layout_details.blackwell_scale import unswizzle_mx_scale_bw
 from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper
-from triton_kernels.tensor_details.layout_details.hopper_value import unswizzle_mxfp4_value_hopper
+from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
 from triton_kernels.numerics_details.flexpoint import float_to_flex, load_scale
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 from ._common import make_matmul_repr, matmul_launch_metadata, swizzle2d, xcd_swizzle, get_scaled_dot_format_string
@@ -250,18 +250,24 @@ def _matmul_ogs(
                 w_scales = unswizzle_mx_scale_bw(tl.load(MxScalePtrs))
             elif SWIZZLE_MX_SCALE == "HOPPER_SCALE":
                 # Handshake with the swizzling code
-                tl.static_assert(tl.extra.cuda.num_warps() == 8, "Only 8 warps are supported for Hopper swizzling. Got %d" % tl.extra.cuda.num_warps())
-                w_scales = unswizzle_mxfp4_scale_hopper(tl.load(MxScalePtrs), num_warps=8)
+                num_warps: tl.constexpr = tl.extra.cuda.num_warps()
+                w_scales = unswizzle_mxfp4_scale_hopper(tl.load(MxScalePtrs), mx_axis=1, num_warps=num_warps)
             else:
                 w_scales = tl.load(MxScalePtrs, mask=mask_k_scale[None, :], other=0.0)
 
             if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
                 # Handshake with the swizzling code
-                w = unswizzle_mxfp4_value_hopper(w, op_idx=1, mma_version=3)
-                mma_version: tl.constexpr = 3 if w.shape[1] >= 64 else 2
-                tl.static_assert(mma_version == 3, "Only mma_version 3 is supported for Hopper swizzling")
-
-            acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, mx_format, acc=acc, fast_math=True)
+                tl.static_assert(x_format == "bf16")
+                tl.static_assert(mx_format == "e2m1")
+                w = mxfp4_to_bf16_triton(w.trans(), w_scales, 1)
+                tl.static_assert(w.dtype == tl.bfloat16)
+                acc = acc.trans()
+                x = x.trans()
+                # w = w.trans()
+                acc = tl.dot(w, x, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32)
+                acc = acc.trans()
+            else:
+                acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, mx_format, acc=acc, fast_math=True)
             if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE":
                 MxScalePtrs += (MX_SCALE_BLOCK_K // 4 * SPLIT_K) * stride_mx_k
             else:
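The new HOPPER_VALUE path drops tl.dot_scaled: the swizzled fp4 weights are upcast to bf16 in-kernel via mxfp4_to_bf16_triton, and a plain tl.dot is issued with the weight tile as the left operand and both x and acc transposed, transposing the accumulator back afterwards. The transpositions cancel algebraically; below is a small PyTorch check of the identity the rewrite relies on (fp32 tensors stand in for the in-kernel bf16 tiles).

import torch

# Original path: acc + x @ w_upcast.
# New path:      (w_upcast.T @ x.T + acc.T).T, which is the same result.
m, k, n = 64, 128, 32
x = torch.randn(m, k)
w = torch.randn(k, n)        # stand-in for the upcast mxfp4 weight tile
acc = torch.randn(m, n)

ref = acc + x @ w
new = (w.T @ x.T + acc.T).T
assert torch.allclose(ref, new, atol=1e-5)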

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 3 additions & 7 deletions
@@ -3,7 +3,6 @@
 from triton_kernels.target_info import get_cdna_version
 import torch
 from .opt_flags_details import opt_flags_amd, opt_flags_nvidia
-from ..tensor import get_layout
 
 # fmt: off
 
@@ -157,12 +156,10 @@ def make_default_opt_flags_nvidia(
     elif enforce_bitwise_invariance:
         block_m = 128
     else:
-        block_m = max(64, min(triton.next_power_of_2(tokens_per_expt), 128))
+        block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
     # block n
     arch = None
     block_n = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
-    if precision_config.weight_scale is not None and get_layout(precision_config.weight_scale).name == "HOPPER_SCALE":
-        block_n = 256
     # is_persistent
     grid_size = opt_flags_nvidia.compute_grid_size(routing_data, m, n, block_m, block_n)
     n_sms = torch.cuda.get_device_properties(0).multi_processor_count
@@ -177,7 +174,7 @@ def make_default_opt_flags_nvidia(
     if constraints.get("block_k", None) is not None:
         block_k = constraints["block_k"]
     else:
-        block_k = opt_flags_nvidia.compute_block_k(k, is_persistent, lhs_dtype, rhs_dtype, precision_config)
+        block_k = opt_flags_nvidia.compute_block_k(m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config)
     # split_k
     if constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
@@ -219,8 +216,7 @@ def make_default_opt_flags_nvidia(
     else:
         fused_scatter = can_use_fused_scatter and split_k == 1
     # Handshake with the HBM swizzling
-    hopper_swizzling = precision_config.weight_scale is not None and get_layout(precision_config.weight_scale).name == "HOPPER_SCALE"
-    num_warps = 8 if hopper_swizzling else opt_flags_nvidia.compute_num_warps(block_m, block_n)
+    num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, precision_config)
     ret = OptFlags(
         block_m=block_m,
         block_n=block_n,

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 14 additions & 9 deletions
@@ -1,7 +1,8 @@
 import torch
 import triton
 from triton_kernels import target_info
-from triton_kernels.tensor import bitwidth, FP4
+from triton_kernels.tensor import get_layout, bitwidth, FP4
+from triton_kernels.tensor_details.layout import HopperMXScaleLayout
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 
 
@@ -16,19 +17,20 @@ def compute_grid_size(routing_data, m, n, block_m, block_n):
 
 def compute_block_n(n: int, arch, precision_config):
     # block_n:
-    block_n = max(16, min(128, triton.next_power_of_2(n)))
-    # On Ampere and Hopper, handshake with swizzle_mxfp4_scale_hopper
-    if precision_config.max_num_imprecise_acc is None and n > 128:
-        block_n = 256
-    return block_n
+    layout = get_layout(precision_config.weight_scale)
+    if isinstance(layout, HopperMXScaleLayout) and layout.num_warps == 4:
+        return 128
+    elif precision_config.max_num_imprecise_acc is None and n > 128:
+        return 256
+    else:
+        return max(16, min(128, triton.next_power_of_2(n)))
 
 
-def compute_block_k(k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config):
+def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config):
     lhs_width = bitwidth(lhs_dtype)
     rhs_width = bitwidth(rhs_dtype)
     # block_k needs to match the cacheline size (1024 bits)
     block_k = int(1024 // min(lhs_width, rhs_width))
-    # TODO: revisit when Triton is better for H100 + MXFP4
     has_native_mxfp = target_info.cuda_capability_geq(10, 0)
     if rhs_width == 4 and not has_native_mxfp:
         block_k = 128
@@ -52,7 +54,10 @@ def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:
     return split_k
 
 
-def compute_num_warps(block_m, block_n):
+def compute_num_warps(block_m, block_n, precision_config):
+    layout = get_layout(precision_config.weight_scale)
+    if isinstance(layout, HopperMXScaleLayout):
+        return layout.num_warps
     return max(block_m * block_n // 4096, 4)
 
 
python/triton_kernels/triton_kernels/tensor.py

Lines changed: 5 additions & 3 deletions
@@ -172,7 +172,9 @@ def sum(self, partials_block_size):
         return sum_bitmatrix_rows(self, out_ret, partials_block_size)
 
 
-def get_layout(tensor: torch.Tensor | Tensor):
+def get_layout(tensor: torch.Tensor | Tensor | None):
+    if tensor is None:
+        return None
     if isinstance(tensor, Tensor):
         return tensor.storage.layout
     return StridedLayout
@@ -186,11 +188,11 @@ def wrap_torch_tensor(torch_tensor, dtype=None):
     return Tensor(Storage(torch_tensor), dtype=dtype, shape=shape)
 
 
-def convert_layout(tensor: Tensor, layout_cls: Type[Layout]):
+def convert_layout(tensor: Tensor, layout_cls: Type[Layout], **layout_kwargs):
     assert isinstance(tensor, Tensor)
     old_storage = tensor.storage
     old_data = old_storage.layout.unswizzle_data(old_storage.data)
-    new_layout = layout_cls(old_data.shape)
+    new_layout = layout_cls(old_data.shape, **layout_kwargs)
     new_data = new_layout.swizzle_data(old_data)
     attrs = {k.name: getattr(tensor, k.name) for k in fields(tensor) if k.name != "storage"}
     return Tensor(Storage(new_data, new_layout), **attrs)

python/triton_kernels/triton_kernels/tensor_details/layout.py

Lines changed: 19 additions & 0 deletions
@@ -3,6 +3,7 @@
 from .layout_details.hopper_scale import HopperMXScaleLayout
 from .layout_details.hopper_value import HopperMXValueLayout
 from .layout_details.strided import StridedLayout
+from ..target_info import cuda_capability_geq
 
 __all__ = [
     "Layout",
@@ -11,3 +12,21 @@
     "HopperMXValueLayout",
     "StridedLayout",
 ]
+
+
+def make_default_matmul_mxfp4_w_layout(mx_axis: int):
+    if cuda_capability_geq(10):
+        return StridedLayout, dict()
+    elif cuda_capability_geq(9):
+        return HopperMXValueLayout, {"mx_axis": mx_axis}
+    else:
+        return StridedLayout, dict()
+
+
+def make_default_matmul_mxfp4_w_scale_layout(mx_axis: int, num_warps: int = 8):
+    if cuda_capability_geq(10):
+        return BlackwellMXScaleLayout, dict()
+    elif cuda_capability_geq(9):
+        return HopperMXScaleLayout, {"mx_axis": mx_axis, "num_warps": num_warps}
+    else:
+        return StridedLayout, dict()
