Commit 9326a2d

[kernels] use more host TMA for X, W, Mx in persistent matmul (#7182)
Host TMA is now used for X when it is loaded (not gathered), and is always used for W and the Mx scales. Gather TMA stays device-side, since it performs better than host TMA descriptors for that path. Also includes fixes in the epilogue for subtiling of the bias tensor: chunks of it are loaded separately instead of loading all of it and then splitting it, which reduces register spilling.
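
For orientation, a minimal sketch of the descriptor choice this commit lands on. It is not the committed code: the helper name pick_x_argument and the block sizes are illustrative, while TensorDescriptor.from_tensor, gather_indx, and the gather-stays-on-device rule come from the diff below.

    from triton.tools.tensor_descriptor import TensorDescriptor

    def pick_x_argument(x, gather_indx, block_m, block_k):
        # Gathered X stays a plain tensor: the kernel builds its gather TMA
        # descriptor on the device, which performs better than a host
        # descriptor for the gather path.
        if gather_indx is not None:
            return x
        # Loaded (non-gathered) X gets a host-side TMA descriptor, as do W
        # and the Mx scales (2D X shown here for brevity).
        return TensorDescriptor.from_tensor(x, block_shape=[block_m, block_k])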
1 parent d748303 commit 9326a2d

4 files changed: +121 -246 lines changed


python/triton/tools/tensor_descriptor.py

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,8 @@
 from dataclasses import dataclass
 from typing import List, Any
 from triton._utils import validate_block_shape
+from torch._subclasses.fake_tensor import FakeTensor
+from torch._subclasses.functional_tensor import FunctionalTensor
 
 
 @dataclass
@@ -16,7 +18,8 @@ def __post_init__(self):
         assert len(self.block_shape) == rank, f"rank mismatch: {self}"
         assert rank > 0, "rank must not be zero"
         assert rank <= 5, "rank cannot be more than 5"
-        assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        if not isinstance(self.base, (FakeTensor, FunctionalTensor)):
+            assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
         validate_block_shape(self.block_shape)
         elem_bytes = self.base.dtype.itemsize
         for stride in self.strides[:-1]:
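
The relaxed assert above matters because FakeTensor and FunctionalTensor are the tensor subclasses PyTorch uses while tracing (for example under torch.compile): they carry shape, stride, and dtype metadata but no real storage, so a data_ptr() alignment check is not meaningful there and may raise. A rough, illustrative sketch, assuming the usual torch._subclasses.fake_tensor.FakeTensorMode entry point:

    import torch
    from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

    with FakeTensorMode():
        x = torch.empty(128, 128, dtype=torch.float16)

    # x is a FakeTensor: metadata only, no backing storage. __post_init__ now
    # skips the 16-byte data_ptr() alignment assert for bases like this one.
    assert isinstance(x, FakeTensor)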

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 74 additions & 114 deletions
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 import itertools
-import math
 import sys
 import torch
 import triton
@@ -121,20 +120,19 @@ def create_weight_descriptor(w_tensor: torch.Tensor, block_k: int, block_n: int,
                                 transpose=transpose)
 
     @staticmethod
-    def create_block_scale_descriptor(mx_tensor: torch.Tensor, block_k: int, block_n: int, K: int, N: int,
-                                      mx_scale_stride_k: int, mx_scale_stride_n: int, n_expts_tot: int, batch_size: int,
-                                      expt_data: Optional[ExptData], swizzle_mx: bool,
-                                      transpose: bool) -> TensorDescriptor:
+    def create_block_scale_descriptor(mx_tensor: torch.Tensor, block_k: int, block_n: int, B: int, K: int, N: int,
+                                      mx_scale_stride_k: int, mx_scale_stride_n: int, swizzle_mx: bool,
+                                      transpose: Optional[bool]) -> TensorDescriptor:
         """Create a tensor descriptor for block scale factors"""
         MX_PACK_DIVISOR = 32
         MX_SCALE_BLOCK_K = block_k // MX_PACK_DIVISOR
         PackedK = (K + MX_PACK_DIVISOR - 1) // MX_PACK_DIVISOR
 
         if swizzle_mx:
-            num_expt_x_ncol = (n_expts_tot if expt_data is not None and len(expt_data.block_pid_map) > 0 else
-                               batch_size) * ((N + 127) // 128)
+            assert transpose is None
+            num_expt_x_ncol = B * triton.cdiv(N, 128)
             return TensorDescriptor(
-                base=mx_tensor, shape=[1, num_expt_x_ncol, (PackedK + 3) // 4, 2, 256],
+                base=mx_tensor, shape=[1, num_expt_x_ncol, triton.cdiv(PackedK, 4), 2, 256],
                 strides=[num_expt_x_ncol * mx_scale_stride_n, mx_scale_stride_n, mx_scale_stride_k, 256,
                          1], block_shape=[1, block_n // 128, MX_SCALE_BLOCK_K // 4, 2, 256])
         else:
151149
return x.view(*new_shape)
152150

153151
@staticmethod
154-
def create_input_descriptor_gather(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int,
155-
block_k: int) -> TensorDescriptor:
156-
"""Create a tensor descriptor for input matrix X via TMA gather"""
157-
x_desc = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
158-
assert x_desc.ndim == 2, "TMA gather descriptor requires 2D input"
159-
INT_MAX = 2147483647
160-
return TensorDescriptor(base=x_desc, shape=[INT_MAX, K], strides=[x_stride_1, x_stride_2],
161-
block_shape=[1, block_k])
162-
163-
@staticmethod
164-
def create_input_descriptor_load(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int, block_m: int,
165-
block_k: int) -> TensorDescriptor:
166-
"""Create a tensor descriptor for input matrix X via TMA"""
167-
x_desc = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
168-
assert x_desc.ndim in [2, 3], "LHS input TMA descriptor builder expects 2D or 3D input"
169-
return TensorDescriptor(base=x_desc, shape=[x_desc.shape[0], K], strides=[x_stride_1, x_stride_2],
170-
block_shape=[block_m, block_k])
171-
172-
@staticmethod
173-
def create_input_descriptor(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int, block_k: int,
174-
block_m: int, use_gather_tma: bool, use_load_tma: bool) -> TensorDescriptor:
175-
"""Create a tensor descriptor for input matrix X based on TMA usage"""
176-
if use_gather_tma:
177-
return TensorDescriptorBuilder.create_input_descriptor_gather(x_tensor, K, x_stride_1, x_stride_2, block_k)
178-
elif use_load_tma:
179-
return TensorDescriptorBuilder.create_input_descriptor_load(x_tensor, K, x_stride_1, x_stride_2, block_m,
180-
block_k)
181-
else:
182-
return x_tensor
152+
def create_descriptor(x_tensor: torch.Tensor, block_m: int, block_k: int) -> TensorDescriptor:
153+
"""Create a tensor descriptor for matrix X via TMA"""
154+
x_tensor = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
155+
assert x_tensor.ndim in [2, 3], "TMA descriptor builder expects 2D or 3D input"
156+
block_shape = [1] * (x_tensor.ndim - 2) + [block_m, block_k]
157+
return TensorDescriptor.from_tensor(x_tensor, block_shape=block_shape)
183158

184159

185160
# ---------------------
@@ -590,66 +565,53 @@ def _create_tma_descriptors(
590565
mx_ctx: MicroscalingCtx,
591566
expt_data: ExptData,
592567
opt_flags: OptFlags,
593-
batch_size: int,
568+
B: int,
594569
K: int,
595570
N: int,
596571
mx_scale_stride_k: int,
597572
mx_scale_stride_n: int,
598-
USE_GATHER_TMA: bool,
599-
X_USE_LOAD_TMA: bool,
600-
w_transpose: bool,
601-
mx_transpose: bool,
573+
HAS_GATHER: bool,
602574
) -> Tuple[bool, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
603575
"""Create and cache TMA descriptors for tensors."""
604-
use_host_tma_descriptors = opt_flags.is_persistent and target_info.cuda_capability_geq(10, 0)
605-
606-
x_desc, w_desc = [None] * 2
607-
descriptors = []
608-
# The dense case currently uses on device descriptor updates
609-
# so we bail out on using host descriptors in that case
610-
if (use_host_tma_descriptors):
611-
if USE_GATHER_TMA or X_USE_LOAD_TMA:
612-
x_desc = TensorDescriptorBuilder.create_input_descriptor(
613-
x, K, x.stride(1), x.stride(2),
614-
opt_flags.block_k, opt_flags.block_m,
615-
USE_GATHER_TMA, X_USE_LOAD_TMA
616-
)
617-
descriptors.append(x_desc)
618-
if (expt_data is not None and len(expt_data.block_pid_map) > 0):
619-
w_desc = TensorDescriptorBuilder.create_weight_descriptor(
620-
w, opt_flags.block_k, opt_flags.block_n, w_transpose
621-
)
622-
is_microscaled_format = (mx_ctx.weight_scale is not None) and (w.dtype == torch.uint8)
623-
if is_microscaled_format:
624-
# Pad the inner shape to 128 for mxfp4 weights
625-
# for mixed precision fp8 x mxfp4 compute
626-
pad = 128
627-
dim_to_pad = -1
628-
old_size = w_desc.shape[dim_to_pad]
629-
padded_size = math.ceil(old_size / pad) * pad
630-
if padded_size != old_size:
631-
w_desc.shape = list(w_desc.shape)
632-
w_desc.shape[dim_to_pad] = padded_size
633-
descriptors.append(w_desc)
634-
# Optional MX scale descriptor
635-
descriptors.append(None)
636-
if mx_tensor is not None:
637-
descriptors[-1] = TensorDescriptorBuilder.create_block_scale_descriptor(
638-
mx_tensor, opt_flags.block_k, opt_flags.block_n, K, N,
639-
mx_scale_stride_k, mx_scale_stride_n, routing_data.n_expts_tot,
640-
batch_size,
641-
expt_data, mx_ctx.swizzle_scale, mx_transpose
642-
)
643576

644-
# TODO: Currently all or none, instead should support a mixture
645-
# of host and device descriptors
646-
if None in descriptors or len(descriptors) == 0:
647-
descriptors = [x, w, mx_tensor]
648-
use_host_tma_descriptors = False
649-
if opt_flags.is_persistent:
650-
opt_flags.target_kernel_kwargs["USE_HOST_TMA_DESCRIPTORS"] = use_host_tma_descriptors
577+
x_tensor_or_desc, mx_desc_and_transpose = x, (None, False)
651578

652-
return use_host_tma_descriptors, *descriptors
579+
if not HAS_GATHER:
580+
x_tensor_or_desc = TensorDescriptorBuilder.create_descriptor(x, opt_flags.block_m, opt_flags.block_k)
581+
582+
w_transpose = w.stride(2) != 1
583+
w_desc = TensorDescriptorBuilder.create_weight_descriptor(
584+
w, opt_flags.block_k, opt_flags.block_n, w_transpose
585+
)
586+
w_desc_and_transpose = (w_desc, w_transpose)
587+
588+
is_microscaled_format = mx_ctx.weight_scale is not None and w.dtype == torch.uint8
589+
if is_microscaled_format:
590+
# Pad the inner shape to 128 for mxfp4 weights; TMA requires this when the compiler uses
591+
# CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B.
592+
# This technically makes the shape masking incorrect, but it's fine because:
593+
# - When the N dim is padded, the scales will be masked to 0.
594+
# - When the K dim is padded, the activations we perform tl.dot with will be masked to 0.
595+
# Note: the scales can't be relied on for zeroing in this case, because they apply to groups
596+
# of 32 elements in the K dimension.
597+
pad = 128
598+
dim_to_pad = -1
599+
old_size = w_desc.shape[dim_to_pad]
600+
padded_size = triton.cdiv(old_size, pad) * pad
601+
if padded_size != old_size:
602+
w_desc.shape = list(w_desc.shape)
603+
w_desc.shape[dim_to_pad] = padded_size
604+
605+
if mx_tensor is not None:
606+
mx_transpose = mx_scale_stride_n != 1 if mx_ctx.swizzle_scale is None else None
607+
mx_desc = TensorDescriptorBuilder.create_block_scale_descriptor(
608+
mx_tensor, opt_flags.block_k, opt_flags.block_n,
609+
routing_data.n_expts_tot if expt_data is not None and len(expt_data.block_pid_map) > 0 else B, K, N,
610+
mx_scale_stride_k, mx_scale_stride_n, mx_ctx.swizzle_scale, mx_transpose
611+
)
612+
mx_desc_and_transpose = (mx_desc, mx_transpose)
613+
614+
return x_tensor_or_desc, w_desc_and_transpose, mx_desc_and_transpose
653615

654616

655617
def matmul_ogs(x, w, bias,
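
To make the mxfp4 padding above concrete (the size is hypothetical, not from the commit): an innermost descriptor dimension of 2880 packed values is rounded up to the next multiple of 128, and the comment in the diff explains why the extra region is safe to expose to TMA.

    import triton

    pad = 128
    old_size = 2880                                  # hypothetical innermost dim of the mxfp4 weight descriptor
    padded_size = triton.cdiv(old_size, pad) * pad   # cdiv(2880, 128) = 23 -> 23 * 128 = 2944
    assert padded_size == 2944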
@@ -754,41 +716,39 @@ def matmul_ogs(x, w, bias,
     expt_token_offs_raw = None if expt_data is None else expt_data.token_offs_raw
     expt_block_pid_map = None if expt_data is None else expt_data.block_pid_map[block_m]
 
-    HAS_TMA_GS = target_info.cuda_capability_geq(10, 0)
-    USE_GATHER_TMA = HAS_TMA_GS and gather_indx is not None
-    X_USE_LOAD_TMA = gather_indx is None and not USE_GATHER_TMA
-    _, x_tensor, w_tensor, mx_tensor = _create_tma_descriptors(
-        x=x, w=w,
-        mx_tensor=mx_ctx.weight_scale,
-        routing_data=routing_data,
-        mx_ctx=mx_ctx,
-        expt_data=expt_data,
-        opt_flags=opt_flags,
-        batch_size=batch_size,
-        K=K,
-        N=N,
-        mx_scale_stride_k=mx_scale_stride_k,
-        mx_scale_stride_n=mx_scale_stride_n,
-        USE_GATHER_TMA=USE_GATHER_TMA,
-        X_USE_LOAD_TMA=X_USE_LOAD_TMA,
-        w_transpose=w.stride(2) != 1,
-        mx_transpose=mx_scale_stride_n != 1,
-    )
+    if opt_flags.is_persistent:
+        x_tensor, w_tensor_and_transpose, mx_tensor_and_tranpose = _create_tma_descriptors(
+            x=x, w=w, mx_tensor=mx_ctx.weight_scale,
+            routing_data=routing_data,
+            mx_ctx=mx_ctx,
+            expt_data=expt_data,
+            opt_flags=opt_flags,
+            B=batch_size,
+            K=K,
+            N=N,
+            mx_scale_stride_k=mx_scale_stride_k,
+            mx_scale_stride_n=mx_scale_stride_n,
+            HAS_GATHER=gather_indx is not None,
+        )
+        w_tensor, w_tma_transpose = w_tensor_and_transpose
+        mx_tensor, mx_tma_transpose = mx_tensor_and_tranpose
+    else:
+        x_tensor = x
+        w_tensor, w_tma_transpose = w, False
+        mx_tensor, mx_tma_transpose = mx_ctx.weight_scale, False
     if isinstance(x_tensor, torch.Tensor):
         x_tensor = flex.lhs_data.reinterpret(x)
     if isinstance(w_tensor, torch.Tensor):
         w_tensor = flex.rhs_data.reinterpret(w)
     (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(n_cta,)](
         flex.out_data.reinterpret(memory["output"]),
-        flex.out_data.reinterpret(out0), *out0.stride(),
-        *out0_flex,
+        flex.out_data.reinterpret(out0), *out0.stride(), *out0_flex,
         x_tensor, x.stride(0), x.stride(1), x.stride(2),
         flex.lhs_data.scale,
-        w_tensor, w.stride(0), w.stride(1), w.stride(2), w.stride(2) != 1,
+        w_tensor, w.stride(0), w.stride(1), w.stride(2), w_tma_transpose,
         flex.rhs_data.scale,
-        mx_tensor, mx_scale_stride_e, mx_scale_stride_k, mx_scale_stride_n, mx_scale_stride_n != 1,
+        mx_tensor, mx_scale_stride_e, mx_scale_stride_k, mx_scale_stride_n, mx_tma_transpose,
         bias, bias_stride,
-        x.shape[1],
         x.shape[1] if routing_data.expt_hist is None else None,
         N, K,
         betas, gammas,

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ def _matmul_ogs(
                 WScale,
                 MxScale, stride_mx_e, stride_mx_k, stride_mx_n, MX_TRANSPOSE: tl.constexpr,
                 B, stride_b_e, # Bias
-                NRows, M, N, K, # shapes
+                M, N, K, # shapes
                 # expt data
                 Betas, Gammas,
                 GatherIndx,