
Commit 14373ae

[Kernels] Support hopper hbm swizzling in persistent matmul (#8917)
1 parent f3a2261 commit 14373ae


7 files changed, +87 -28 lines


python/triton_kernels/triton_kernels/matmul.py

Lines changed: 5 additions & 8 deletions
@@ -11,6 +11,7 @@
 from triton_kernels import target_info
 from triton_kernels.numerics import InFlexData, OutFlexData
 from triton_kernels.target_info import is_cuda
+from triton_kernels.tensor_details.layout_details.hopper_scale import HopperMXScaleLayout
 # details
 from .matmul_details._matmul import _matmul
 from .matmul_details._p_matmul import _p_matmul, get_per_device_per_stream_alloc_fn
@@ -104,6 +105,8 @@ class PrecisionConfig:
 def get_swap_xw(precision_config, opt_flags):
     if target_info.cuda_capability_geq(10, 0):
         return precision_config.b_mx_scale is not None and opt_flags.block_m <= 64 and opt_flags.is_persistent
+    elif target_info.cuda_capability_geq(9, 0):
+        return precision_config.b_mx_scale is not None and opt_flags.is_persistent

     return False

@@ -296,8 +299,6 @@ def matmul(a, b, bias,
         # Currently we don't support tma if y is column major; may revisit later if this becomes an issue.
         (c is None or c.stride(-1) == 1) and
         (c_acc_in is None or c_acc_is_c) and
-        # for simulated MXFP, not supported
-        (b_scale is None or target_info.has_native_mxfp()) and
         # if ragged dimension is K, w must be either padded or row major to ensure alignment
         (ragged_dimension != "K" or b.stride(-1) == 1 or b_ragged_metadata.slice_sizes_divisibility is not None)
     )
@@ -308,8 +309,6 @@ def matmul(a, b, bias,
         # which is too big.
         can_use_tma = False
     has_gather_tma = has_gather and target_info.has_tma_gather()
-    # hopper w/ mxfp4 doesn't support TMA
-    can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(b.dtype) != 4)
     can_use_split_k = scatter_indx is None and not a_has_mx and not b_has_mx and ragged_dimension != "K"
     block_k = None
     if ragged_dimension == "K":
@@ -338,8 +337,6 @@ def matmul(a, b, bias,
     assert K == K_W
     a_has_tma = opt_flags.is_persistent and (has_gather_tma or not has_gather)
     even_K = (K % opt_flags.block_k == 0)
-    if b_scale is not None and opt_flags.is_persistent and not target_info.has_native_mxfp():
-        raise NotImplementedError("Must use non-persistent kernel for simulated MXFP")
     if b_scale is not None and b_scale.storage.layout.name is not None and not opt_flags.is_persistent and target_info.has_native_mxfp():
         raise NotImplementedError("Must use persistent kernel and be TMA-compliant for native MXFP")
     # fused activation
@@ -425,8 +422,8 @@ def matmul(a, b, bias,
     if b_scale_has_tma:
         scale_block_k = opt_flags.block_k // int(MXFP_BLOCK_SIZE)
         b_scale_storage = b_scale.storage
-        b_scale_tma_block_size = [opt_flags.block_n, scale_block_k] if b_transpose else [scale_block_k, opt_flags.block_n]
-        if isinstance(b_scale.storage.layout, StridedLayout):
+        b_scale_tma_block_size = [scale_block_k, opt_flags.block_n]
+        if isinstance(b_scale_storage.layout, (StridedLayout, HopperMXScaleLayout)):
            b_scale_storage = _canonicalize_storage(b_scale.storage, 3, None)
            b_scale_tma_block_size = [1] + b_scale_tma_block_size
        b_scale_tensor_or_tma = b_scale_storage.make_tma(b_scale_tma_block_size, "dense", is_scale=True)
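
Two things worth noting in this file: get_swap_xw now also returns True on Hopper (capability 9.x) whenever a b mx scale is present and the kernel is persistent, without the block_m <= 64 condition that gates Blackwell; and the b-scale TMA block size no longer depends on b_transpose. A rough sketch of the resulting block-size arithmetic, with illustrative block sizes, not part of the commit (MXFP_BLOCK_SIZE is 32, matching the hard-coded value this commit replaces in hopper_value.py):

    # Illustrative values; in the real code block_k/block_n come from opt_flags.
    MXFP_BLOCK_SIZE = 32
    block_k, block_n = 128, 128
    scale_block_k = block_k // MXFP_BLOCK_SIZE              # 4
    b_scale_tma_block_size = [scale_block_k, block_n]       # [4, 128]
    # For StridedLayout or HopperMXScaleLayout storage, the storage is canonicalized
    # to 3D and a leading 1 is prepended to the block size:
    b_scale_tma_block_size = [1] + b_scale_tma_block_size   # [1, 4, 128]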

python/triton_kernels/triton_kernels/matmul_details/_matmul.py

Lines changed: 0 additions & 1 deletion
@@ -116,7 +116,6 @@ def _matmul(
     tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, f"{BLOCK_K=} must be a multiple of {MX_PACK_DIVISOR=}")
     tl.static_assert(SWIZZLE_MX_VALUE == "HOPPER_VALUE" or SWIZZLE_MX_VALUE is None, "Only Hopper swizzling is supported for values")

-    # TODO: refactor if/else when triton front end improves
     if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
         tl.static_assert(is_w_mxfp4, "Only mxfp4 is supported for HOPPER swizzling")
         tl.static_assert(not is_x_microscaled)

python/triton_kernels/triton_kernels/matmul_details/_p_matmul.py

Lines changed: 56 additions & 13 deletions
@@ -13,6 +13,8 @@
     compute_scale,
 )
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
+from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper
+from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
 from ._common import (
     compute_offsets,
     get_scaled_dot_format_string,
@@ -112,25 +114,48 @@ def _p_matmul(
     if Y_TMA_MODE is not None:
         Y = tl.make_tensor_descriptor(YPtr, Y.shape, Y.strides[:-1] + (1,), Y.block_shape)

+    w_type: tl.constexpr = get_dtype(W)
     is_w_microscaled: tl.constexpr = WMxScale is not None
+    is_x_microscaled: tl.constexpr = XMxScale is not None
+    is_w_mxfp4: tl.constexpr = w_type == tl.uint8 and is_w_microscaled
     tl.static_assert(not is_w_microscaled or W_TRANSPOSE, "NYI. Non-transposed mxfp4 weights")
     MX_PACK_DIVISOR: tl.constexpr = MXFP_BLOCK_SIZE
     if is_w_microscaled:
-        w_type: tl.constexpr = get_dtype(W)
         tl.static_assert(w_type == tl.uint8 or (w_type == tl.float8e4nv or w_type == tl.float8e5),
                          "mx_weight_ptr must be uint8 or fp8")
         tl.static_assert(get_dtype(WMxScale) == tl.uint8, "mx_scale_ptr must be uint8")
         tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR")
-        tl.static_assert(SWIZZLE_MX_SCALE == "BLACKWELL_SCALE" or SWIZZLE_MX_SCALE is None, "Only Blackwell swizzling is supported for scales")

        # We have pack 2 fp4 values in a byte
-        W_PACK_DIVISOR: tl.constexpr = 2 if w_type == tl.uint8 else 1
-        PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K // W_PACK_DIVISOR
         MX_SCALE_BLOCK_K: tl.constexpr = BLOCK_K // MX_PACK_DIVISOR
+        if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
+            tl.static_assert(is_w_mxfp4, "Only mxfp4 is supported for HOPPER swizzling")
+            tl.static_assert(not is_x_microscaled)
+            # We have pack 2 fp4 values in a byte but we divide the dimension by 2
+            # when swizzling
+            W_K_DIVISOR: tl.constexpr = 1
+            W_K_MULTIPLIER: tl.constexpr = 2
+            W_N_DIVISOR: tl.constexpr = 4
+        else:
+            # We have pack 2 fp4 values in a byte
+            W_K_DIVISOR: tl.constexpr = 2 if is_w_mxfp4 else 1
+            W_K_MULTIPLIER: tl.constexpr = 1
+            W_N_DIVISOR: tl.constexpr = 1
+
+        if W_TRANSPOSE:
+            # When weight is transposed, 2 fp4 values are packed per Byte along
+            # the contiguous dimension, K.
+            PACKED_BLOCK_K_W: tl.constexpr = (BLOCK_K // W_K_DIVISOR) * W_K_MULTIPLIER
+            PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N // W_N_DIVISOR
+        else:
+            # When weight is not transposed, fp4 values are *not* packed along
+            # the contiguous dimension, N.
+            PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K
+            PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N // W_K_DIVISOR
     else:
         PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K
+        PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N
         tl.static_assert(SWIZZLE_MX_SCALE is None)
-    is_x_microscaled: tl.constexpr = XMxScale is not None
     if is_x_microscaled:
         x_type: tl.constexpr = get_dtype(X)
         tl.static_assert(x_type == tl.float8e4nv, "mx_act_ptr must be float8e4nv")
@@ -202,6 +227,7 @@ def _p_matmul(
     else:
         shape_m = M
     off_n = BLOCK_N * pid_n
+    off_w_n = PACKED_BLOCK_N_W * pid_n

     # ---- offset x ------
     if USE_GATHER_TMA:
@@ -283,7 +309,7 @@ def _p_matmul(
     x_format: tl.constexpr = get_scaled_dot_format_string(x.dtype)
     if is_x_microscaled:
         if XMxScalePtrs is not None: # not using TMA for x scale load
-            off_k_mx = off_k_w // (MX_PACK_DIVISOR // W_PACK_DIVISOR)
+            off_k_mx = off_k_w // (MX_PACK_DIVISOR // W_K_DIVISOR)
            if EVEN_K:
                mask_k_scale = tl.full([MX_SCALE_BLOCK_K], True, dtype=tl.int1)
            else:
@@ -306,30 +332,47 @@ def _p_matmul(

     # --- load w ---
     if W_TRANSPOSE:
-        w = tl.reshape(W.load([off_w_z, off_n, off_k_w]), W.block_shape[1:]).T
+        w = tl.reshape(W.load([off_w_z, off_w_n, off_k_w]), W.block_shape[1:]).T
     else:
-        w = tl.reshape(W.load([off_w_z, off_k_w, off_n]), W.block_shape[1:])
+        w = tl.reshape(W.load([off_w_z, off_k_w, off_w_n]), W.block_shape[1:])

     # --- load w_scale ---
     w_format: tl.constexpr = get_scaled_dot_format_string(w.dtype)
     if is_w_microscaled:
-        off_k_mx = off_k_w // (MX_PACK_DIVISOR // W_PACK_DIVISOR)
-        tl.static_assert(MX_PACK_DIVISOR % W_PACK_DIVISOR == 0)
+        off_k_mx = off_k_w // (MX_PACK_DIVISOR // W_K_DIVISOR)
+        tl.static_assert(MX_PACK_DIVISOR % W_K_DIVISOR == 0)
         if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE":
             flattened_expt_n_idx = off_w_z * ((N + 127) // 128) + (off_n // 128)
             w_scales = WMxScale.load([0, flattened_expt_n_idx, off_k_mx // 4, 0, 0])
             w_scales = w_scales.reshape((w_scales.shape[1], w_scales.shape[2] * w_scales.shape[-2] * w_scales.shape[-1]))
             w_scales = unswizzle_mx_scale_bw(w_scales)
+        elif SWIZZLE_MX_SCALE == "HOPPER_SCALE":
+            # NYI: Hopper swizzling with non-transposed W
+            tl.static_assert(W_TRANSPOSE)
+            off_n_scale = pid_n * (BLOCK_N // 32)
+            off_k_scale = (off_k_w // PACKED_BLOCK_K_W) * MX_SCALE_BLOCK_K * 32
+            w_scales = WMxScale.load([off_w_z, off_n_scale, off_k_scale])
+            w_scales = tl.reshape(w_scales, *w_scales.shape[1:])
+            num_warps: tl.constexpr = tl.extra.cuda.num_warps()
+            w_scales = unswizzle_mxfp4_scale_hopper(w_scales, mx_axis=1, num_warps=num_warps)
         else:
             w_scales = WMxScale.load([off_w_z, off_k_mx, off_n])
             w_scales = tl.reshape(w_scales, *w_scales.shape[1:]).T

     # --- update accumulator ---
     if is_w_microscaled:
-        if SWAP_XW:
-            acc = tl.dot_scaled(w.T, w_scales, w_format, x.T, x_scales, x_format, acc=acc, fast_math=True)
+        if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
+            tl.static_assert(x_format == "bf16")
+            tl.static_assert(w_format == "e2m1")
+            tl.static_assert(SWAP_XW)
+            wT = mxfp4_to_bf16_triton(w.T, w_scales, mx_axis=1)
+            tl.static_assert(wT.dtype == tl.bfloat16)
+            acc = tl.dot(wT, x.T, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32)
         else:
-            acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, w_format, acc=acc, fast_math=True)
+            if SWAP_XW:
+                acc = tl.dot_scaled(w.T, w_scales, w_format, x.T, x_scales, x_format, acc=acc, fast_math=True)
+            else:
+                acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, w_format, acc=acc, fast_math=True)
     else:
         if SWAP_XW:
             acc = tl.dot(w.T, x.T, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32)
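
To make the new packing constants concrete, here is a small sketch of how PACKED_BLOCK_K_W and PACKED_BLOCK_N_W evaluate for transposed mxfp4 weights; the block sizes are illustrative and the sketch is not part of the kernel:

    # Illustrative: BLOCK_K = BLOCK_N = 128, W_TRANSPOSE = True, mxfp4 weights
    BLOCK_K, BLOCK_N = 128, 128

    # SWIZZLE_MX_VALUE == "HOPPER_VALUE": K grows by 2x, N shrinks by 4x
    W_K_DIVISOR, W_K_MULTIPLIER, W_N_DIVISOR = 1, 2, 4
    packed_k = (BLOCK_K // W_K_DIVISOR) * W_K_MULTIPLIER  # 256
    packed_n = BLOCK_N // W_N_DIVISOR                     # 32

    # No value swizzling: 2 fp4 values per byte along K
    W_K_DIVISOR, W_K_MULTIPLIER, W_N_DIVISOR = 2, 1, 1
    packed_k = (BLOCK_K // W_K_DIVISOR) * W_K_MULTIPLIER  # 64
    packed_n = BLOCK_N // W_N_DIVISOR                     # 128

This is also why the loads now index W with a separate off_w_n = PACKED_BLOCK_N_W * pid_n: under Hopper value swizzling the stored N extent of a tile is BLOCK_N // 4, so the unpacked off_n = BLOCK_N * pid_n would no longer line up with the packed layout.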

python/triton_kernels/triton_kernels/matmul_details/opt_flags.py

Lines changed: 8 additions & 1 deletion
@@ -7,6 +7,7 @@
 from triton_kernels.target_info import get_cdna_version
 from triton_kernels.tensor import FP4
 import torch
+from triton_kernels.tensor_details.layout_details.hopper_scale import HopperMXScaleLayout
 from .opt_flags_details import opt_flags_amd, opt_flags_nvidia
 from triton_kernels.tensor import bitwidth, get_layout

@@ -239,9 +240,15 @@ def make_default_opt_flags_nvidia(
     # TMA is slower for batched matmuls with small m/n/k.
     if m * n * k < 131072:
         is_persistent = False
+    if (
+        (b_scale_layout := get_layout(precision_config.b_mx_scale)) is not None and
+        isinstance(b_scale_layout, HopperMXScaleLayout)
+    ):
+        # TODO: persistent kernel is currently slower than non-persistent
+        is_persistent = False
     # adjust block_n based on is_persistent signal
     block_n = block_n_tma if is_persistent else block_n
-    # adjut block_m based on is_persistent signal
+    # adjust block_m based on is_persistent signal
     if is_persistent and opt_flags_nvidia.is_x_scale_swizzled(precision_config):
         # a mx scale has been swizzled to BlackwellActMXScaleLayout, enforce block_m=128 to align with swizzling layout
         block_m = 128

python/triton_kernels/triton_kernels/tensor_details/layout_details/blackwell_scale.py

Lines changed: 3 additions & 2 deletions
@@ -56,8 +56,9 @@ def unswizzle_data(self, data):
         return data[..., :self.K, :self.N]

     def swizzle_block_shape(self, block_shape):
-        assert block_shape[0] >= 128, f"{block_shape[0]=} must be >= 128"
-        return [1, block_shape[0] // 128, block_shape[1] // 4, 2, 256]
+        K, N = block_shape
+        assert N >= 128, f"{block_shape[1]=} must be >= 128"
+        return [1, N // 128, K // 4, 2, 256]


 @triton.jit
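
A quick numeric check of the reordered indices, with an illustrative scale block (not from the commit):

    # Scale block with K = 64, N = 256
    K, N = 64, 256
    assert N >= 128
    swizzled = [1, N // 128, K // 4, 2, 256]   # [1, 2, 16, 2, 256]

The >= 128 requirement now applies to N (the second block dimension), whereas the old code asserted it on block_shape[0] and swapped the roles of the two dimensions.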

python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_scale.py

Lines changed: 3 additions & 1 deletion
@@ -65,7 +65,9 @@ def unswizzle_data(self, data):
         return data[..., :self.M, :self.K]

     def swizzle_block_shape(self, block_shape):
-        return block_shape
+        N, K = block_shape[-2:]
+        assert N % 32 == 0
+        return [*block_shape[:-2], N // 32, K * 32]


 @triton.jit
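
For orientation, the same mapping in plain numbers (illustrative shape, not from the commit):

    block_shape = [1, 128, 4]        # leading dims pass through; trailing N = 128, K = 4
    N, K = block_shape[-2:]
    assert N % 32 == 0
    swizzled = [*block_shape[:-2], N // 32, K * 32]   # [1, 4, 128]

A factor of 32 moves from the second-to-last dimension into the contiguous last dimension, which is why the new assert requires N % 32 == 0; previously the block shape was returned unchanged.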

python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py

Lines changed: 12 additions & 2 deletions
@@ -3,6 +3,8 @@
 import triton.language as tl
 from dataclasses import dataclass
 from .base import Layout
+
+from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 from triton_kernels.target_info import cuda_capability_geq


@@ -211,7 +213,9 @@ def unswizzle_data(self, data):
         return data[..., :self.K, :self.N]

     def swizzle_block_shape(self, block_shape):
-        return block_shape
+        N, K = block_shape[-2:]
+        assert N % 4 == 0
+        return [*block_shape[:-2], N // 4, K * 4]


@@ -329,9 +333,15 @@ def mxfp4_to_bf16_triton(x, scale, mx_axis: tl.constexpr):
         is_pure=True,
         pack=4,
     )
+    # Sanity check shape
+    for axis in tl.static_range(len(x.shape)):
+        if axis == mx_axis:
+            tl.static_assert(x.shape[axis] == MXFP_BLOCK_SIZE * scale.shape[axis])
+        else:
+            tl.static_assert(x.shape[axis] == scale.shape[axis])
     # Broadcast scale
     scale = scale.expand_dims(mx_axis + 1)
-    scale = scale.broadcast_to(scale.shape[:mx_axis + 1] + [32] + scale.shape[mx_axis + 2:])
+    scale = scale.broadcast_to(scale.shape[:mx_axis + 1] + [MXFP_BLOCK_SIZE] + scale.shape[mx_axis + 2:])
     scale = scale.reshape(x.shape)

     # Combine scale and x
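
Analogous arithmetic for the value layout, plus the effect of the new shape checks (shapes illustrative, not from the commit):

    block_shape = [1, 128, 64]       # trailing N = 128, K = 64
    N, K = block_shape[-2:]
    assert N % 4 == 0
    swizzled = [*block_shape[:-2], N // 4, K * 4]     # [1, 32, 256]

    # In mxfp4_to_bf16_triton, each scale element covers one MXFP_BLOCK_SIZE (= 32)
    # group of values along mx_axis, so the new static asserts require
    # x.shape[mx_axis] == MXFP_BLOCK_SIZE * scale.shape[mx_axis].

The N // 4 mirrors the W_N_DIVISOR = 4 used for PACKED_BLOCK_N_W in _p_matmul, and using MXFP_BLOCK_SIZE instead of a hard-coded 32 keeps the broadcast consistent with those asserts.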
