Commit eacb681
[release/3.4] Cherry-pick triton-lang#7182 (triton-lang#7437)
We are from NVIDIA and have been testing the MoE kernels internally. We saw a strange illegal-memory-access issue on release/3.4.x that is gone on main. After bisecting, we found that triton-lang#7182 fixes it. We think it's important to get this change into the release branch so that we can make good use of the 3.4 release once it's out.

Co-authored-by: aeng-openai <[email protected]>

1 parent: 6e1dafa

4 files changed: +121 −246 lines

python/triton/tools/tensor_descriptor.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -1,6 +1,8 @@
 from dataclasses import dataclass
 from typing import List, Any
 from triton._utils import validate_block_shape
+from torch._subclasses.fake_tensor import FakeTensor
+from torch._subclasses.functional_tensor import FunctionalTensor


 @dataclass
@@ -16,7 +18,8 @@ def __post_init__(self):
         assert len(self.block_shape) == rank, f"rank mismatch: {self}"
         assert rank > 0, "rank must not be zero"
         assert rank <= 5, "rank cannot be more than 5"
-        assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        if not isinstance(self.base, (FakeTensor, FunctionalTensor)):
+            assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
         validate_block_shape(self.block_shape)
         elem_bytes = self.base.dtype.itemsize
         for stride in self.strides[:-1]:
```
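For context on the guard above: under tracing (e.g. torch.compile), `__post_init__` can run on FakeTensor/FunctionalTensor instances, which carry shape and dtype metadata but no real storage, so `data_ptr()` is not a meaningful device address there. A minimal sketch of that situation (not from the commit; the shape is illustrative):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

with FakeTensorMode():
    t = torch.empty(64, 64)  # factory calls under FakeTensorMode yield FakeTensors

print(isinstance(t, FakeTensor))  # True
# `t` has metadata but no backing storage, so `t.data_ptr() % 16 == 0`
# is not a meaningful alignment check here; the patched __post_init__
# therefore skips the assert for fake/functional tensors.
```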

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 74 additions & 114 deletions
```diff
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 import itertools
-import math
 import sys
 import torch
 import triton
```
```diff
@@ -121,20 +120,19 @@ def create_weight_descriptor(w_tensor: torch.Tensor, block_k: int, block_n: int,
                                 transpose=transpose)

     @staticmethod
-    def create_block_scale_descriptor(mx_tensor: torch.Tensor, block_k: int, block_n: int, K: int, N: int,
-                                      mx_scale_stride_k: int, mx_scale_stride_n: int, n_expts_tot: int, batch_size: int,
-                                      expt_data: Optional[ExptData], swizzle_mx: bool,
-                                      transpose: bool) -> TensorDescriptor:
+    def create_block_scale_descriptor(mx_tensor: torch.Tensor, block_k: int, block_n: int, B: int, K: int, N: int,
+                                      mx_scale_stride_k: int, mx_scale_stride_n: int, swizzle_mx: bool,
+                                      transpose: Optional[bool]) -> TensorDescriptor:
         """Create a tensor descriptor for block scale factors"""
         MX_PACK_DIVISOR = 32
         MX_SCALE_BLOCK_K = block_k // MX_PACK_DIVISOR
         PackedK = (K + MX_PACK_DIVISOR - 1) // MX_PACK_DIVISOR

         if swizzle_mx:
-            num_expt_x_ncol = (n_expts_tot if expt_data is not None and len(expt_data.block_pid_map) > 0 else
-                               batch_size) * ((N + 127) // 128)
+            assert transpose is None
+            num_expt_x_ncol = B * triton.cdiv(N, 128)
             return TensorDescriptor(
-                base=mx_tensor, shape=[1, num_expt_x_ncol, (PackedK + 3) // 4, 2, 256],
+                base=mx_tensor, shape=[1, num_expt_x_ncol, triton.cdiv(PackedK, 4), 2, 256],
                 strides=[num_expt_x_ncol * mx_scale_stride_n, mx_scale_stride_n, mx_scale_stride_k, 256,
                          1], block_shape=[1, block_n // 128, MX_SCALE_BLOCK_K // 4, 2, 256])
         else:
```
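To make the swizzled-scale shape above concrete: `triton.cdiv(a, b)` is integer ceiling division, `(a + b - 1) // b`, replacing the handwritten forms in the old code. A worked example with illustrative sizes (none of these numbers come from the commit):

```python
import triton

# Illustrative sizes, not from the commit:
K, N, B = 8192, 4096, 8
block_k, block_n = 128, 256

MX_PACK_DIVISOR = 32
MX_SCALE_BLOCK_K = block_k // MX_PACK_DIVISOR            # 4 scale blocks per block_k tile
PackedK = (K + MX_PACK_DIVISOR - 1) // MX_PACK_DIVISOR   # 256 scale groups along K
num_expt_x_ncol = B * triton.cdiv(N, 128)                # 8 * 32 = 256

shape = [1, num_expt_x_ncol, triton.cdiv(PackedK, 4), 2, 256]     # [1, 256, 64, 2, 256]
block_shape = [1, block_n // 128, MX_SCALE_BLOCK_K // 4, 2, 256]  # [1, 2, 1, 2, 256]
```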
```diff
@@ -151,35 +149,12 @@ def squeeze_after_dim(x, dim=2):
         return x.view(*new_shape)

     @staticmethod
-    def create_input_descriptor_gather(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int,
-                                       block_k: int) -> TensorDescriptor:
-        """Create a tensor descriptor for input matrix X via TMA gather"""
-        x_desc = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
-        assert x_desc.ndim == 2, "TMA gather descriptor requires 2D input"
-        INT_MAX = 2147483647
-        return TensorDescriptor(base=x_desc, shape=[INT_MAX, K], strides=[x_stride_1, x_stride_2],
-                                block_shape=[1, block_k])
-
-    @staticmethod
-    def create_input_descriptor_load(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int, block_m: int,
-                                     block_k: int) -> TensorDescriptor:
-        """Create a tensor descriptor for input matrix X via TMA"""
-        x_desc = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
-        assert x_desc.ndim in [2, 3], "LHS input TMA descriptor builder expects 2D or 3D input"
-        return TensorDescriptor(base=x_desc, shape=[x_desc.shape[0], K], strides=[x_stride_1, x_stride_2],
-                                block_shape=[block_m, block_k])
-
-    @staticmethod
-    def create_input_descriptor(x_tensor: torch.Tensor, K: int, x_stride_1: int, x_stride_2: int, block_k: int,
-                                block_m: int, use_gather_tma: bool, use_load_tma: bool) -> TensorDescriptor:
-        """Create a tensor descriptor for input matrix X based on TMA usage"""
-        if use_gather_tma:
-            return TensorDescriptorBuilder.create_input_descriptor_gather(x_tensor, K, x_stride_1, x_stride_2, block_k)
-        elif use_load_tma:
-            return TensorDescriptorBuilder.create_input_descriptor_load(x_tensor, K, x_stride_1, x_stride_2, block_m,
-                                                                        block_k)
-        else:
-            return x_tensor
+    def create_descriptor(x_tensor: torch.Tensor, block_m: int, block_k: int) -> TensorDescriptor:
+        """Create a tensor descriptor for matrix X via TMA"""
+        x_tensor = TensorDescriptorBuilder.squeeze_after_dim(x_tensor)
+        assert x_tensor.ndim in [2, 3], "TMA descriptor builder expects 2D or 3D input"
+        block_shape = [1] * (x_tensor.ndim - 2) + [block_m, block_k]
+        return TensorDescriptor.from_tensor(x_tensor, block_shape=block_shape)


 # ---------------------
```
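The new `create_descriptor` derives the block shape from the rank of the (squeezed) input: any leading batch-like dimension gets a block extent of 1, and only the trailing `[block_m, block_k]` tile is nontrivial. A quick sketch of that arithmetic (the tile sizes are illustrative):

```python
block_m, block_k = 128, 64

for ndim in (2, 3):
    # 2D (M, K) input -> [128, 64]; 3D (B, M, K) input -> [1, 128, 64]
    block_shape = [1] * (ndim - 2) + [block_m, block_k]
    print(ndim, block_shape)
```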
```diff
@@ -590,66 +565,53 @@ def _create_tma_descriptors(
     mx_ctx: MicroscalingCtx,
     expt_data: ExptData,
     opt_flags: OptFlags,
-    batch_size: int,
+    B: int,
     K: int,
     N: int,
     mx_scale_stride_k: int,
     mx_scale_stride_n: int,
-    USE_GATHER_TMA: bool,
-    X_USE_LOAD_TMA: bool,
-    w_transpose: bool,
-    mx_transpose: bool,
+    HAS_GATHER: bool,
 ) -> Tuple[bool, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """Create and cache TMA descriptors for tensors."""
-    use_host_tma_descriptors = opt_flags.is_persistent and target_info.cuda_capability_geq(10, 0)
-
-    x_desc, w_desc = [None] * 2
-    descriptors = []
-    # The dense case currently uses on device descriptor updates
-    # so we bail out on using host descriptors in that case
-    if (use_host_tma_descriptors):
-        if USE_GATHER_TMA or X_USE_LOAD_TMA:
-            x_desc = TensorDescriptorBuilder.create_input_descriptor(
-                x, K, x.stride(1), x.stride(2),
-                opt_flags.block_k, opt_flags.block_m,
-                USE_GATHER_TMA, X_USE_LOAD_TMA
-            )
-            descriptors.append(x_desc)
-        if (expt_data is not None and len(expt_data.block_pid_map) > 0):
-            w_desc = TensorDescriptorBuilder.create_weight_descriptor(
-                w, opt_flags.block_k, opt_flags.block_n, w_transpose
-            )
-            is_microscaled_format = (mx_ctx.weight_scale is not None) and (w.dtype == torch.uint8)
-            if is_microscaled_format:
-                # Pad the inner shape to 128 for mxfp4 weights
-                # for mixed precision fp8 x mxfp4 compute
-                pad = 128
-                dim_to_pad = -1
-                old_size = w_desc.shape[dim_to_pad]
-                padded_size = math.ceil(old_size / pad) * pad
-                if padded_size != old_size:
-                    w_desc.shape = list(w_desc.shape)
-                    w_desc.shape[dim_to_pad] = padded_size
-            descriptors.append(w_desc)
-        # Optional MX scale descriptor
-        descriptors.append(None)
-        if mx_tensor is not None:
-            descriptors[-1] = TensorDescriptorBuilder.create_block_scale_descriptor(
-                mx_tensor, opt_flags.block_k, opt_flags.block_n, K, N,
-                mx_scale_stride_k, mx_scale_stride_n, routing_data.n_expts_tot,
-                batch_size,
-                expt_data, mx_ctx.swizzle_scale, mx_transpose
-            )

-    # TODO: Currently all or none, instead should support a mixture
-    # of host and device descriptors
-    if None in descriptors or len(descriptors) == 0:
-        descriptors = [x, w, mx_tensor]
-        use_host_tma_descriptors = False
-    if opt_flags.is_persistent:
-        opt_flags.target_kernel_kwargs["USE_HOST_TMA_DESCRIPTORS"] = use_host_tma_descriptors
+    x_tensor_or_desc, mx_desc_and_transpose = x, (None, False)

-    return use_host_tma_descriptors, *descriptors
+    if not HAS_GATHER:
+        x_tensor_or_desc = TensorDescriptorBuilder.create_descriptor(x, opt_flags.block_m, opt_flags.block_k)
+
+    w_transpose = w.stride(2) != 1
+    w_desc = TensorDescriptorBuilder.create_weight_descriptor(
+        w, opt_flags.block_k, opt_flags.block_n, w_transpose
+    )
+    w_desc_and_transpose = (w_desc, w_transpose)
+
+    is_microscaled_format = mx_ctx.weight_scale is not None and w.dtype == torch.uint8
+    if is_microscaled_format:
+        # Pad the inner shape to 128 for mxfp4 weights; TMA requires this when the compiler uses
+        # CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B.
+        # This technically makes the shape masking incorrect, but it's fine because:
+        # - When the N dim is padded, the scales will be masked to 0.
+        # - When the K dim is padded, the activations we perform tl.dot with will be masked to 0.
+        # Note: the scales can't be relied on for zeroing in this case, because they apply to groups
+        # of 32 elements in the K dimension.
+        pad = 128
+        dim_to_pad = -1
+        old_size = w_desc.shape[dim_to_pad]
+        padded_size = triton.cdiv(old_size, pad) * pad
+        if padded_size != old_size:
+            w_desc.shape = list(w_desc.shape)
+            w_desc.shape[dim_to_pad] = padded_size
+
+    if mx_tensor is not None:
+        mx_transpose = mx_scale_stride_n != 1 if mx_ctx.swizzle_scale is None else None
+        mx_desc = TensorDescriptorBuilder.create_block_scale_descriptor(
+            mx_tensor, opt_flags.block_k, opt_flags.block_n,
+            routing_data.n_expts_tot if expt_data is not None and len(expt_data.block_pid_map) > 0 else B, K, N,
+            mx_scale_stride_k, mx_scale_stride_n, mx_ctx.swizzle_scale, mx_transpose
+        )
+        mx_desc_and_transpose = (mx_desc, mx_transpose)
+
+    return x_tensor_or_desc, w_desc_and_transpose, mx_desc_and_transpose


 def matmul_ogs(x, w, bias,
```
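One detail worth noting in the hunk above: the float round-trip through `math.ceil(old_size / pad)` is replaced by integer ceiling division, which is why the `import math` at the top of the file is dropped. A small sketch of the padding arithmetic (the `old_size` value is made up):

```python
import math
import triton

old_size, pad = 200, 128
# Integer ceiling division: triton.cdiv(a, b) == (a + b - 1) // b.
padded_size = triton.cdiv(old_size, pad) * pad  # 2 * 128 = 256
assert padded_size == math.ceil(old_size / pad) * pad == 256
```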
```diff
@@ -754,41 +716,39 @@ def matmul_ogs(x, w, bias,
     expt_token_offs_raw = None if expt_data is None else expt_data.token_offs_raw
     expt_block_pid_map = None if expt_data is None else expt_data.block_pid_map[block_m]

-    HAS_TMA_GS = target_info.cuda_capability_geq(10, 0)
-    USE_GATHER_TMA = HAS_TMA_GS and gather_indx is not None
-    X_USE_LOAD_TMA = gather_indx is None and not USE_GATHER_TMA
-    _, x_tensor, w_tensor, mx_tensor = _create_tma_descriptors(
-        x=x, w=w,
-        mx_tensor=mx_ctx.weight_scale,
-        routing_data=routing_data,
-        mx_ctx=mx_ctx,
-        expt_data=expt_data,
-        opt_flags=opt_flags,
-        batch_size=batch_size,
-        K=K,
-        N=N,
-        mx_scale_stride_k=mx_scale_stride_k,
-        mx_scale_stride_n=mx_scale_stride_n,
-        USE_GATHER_TMA=USE_GATHER_TMA,
-        X_USE_LOAD_TMA=X_USE_LOAD_TMA,
-        w_transpose=w.stride(2) != 1,
-        mx_transpose=mx_scale_stride_n != 1,
-    )
+    if opt_flags.is_persistent:
+        x_tensor, w_tensor_and_transpose, mx_tensor_and_tranpose = _create_tma_descriptors(
+            x=x, w=w, mx_tensor=mx_ctx.weight_scale,
+            routing_data=routing_data,
+            mx_ctx=mx_ctx,
+            expt_data=expt_data,
+            opt_flags=opt_flags,
+            B=batch_size,
+            K=K,
+            N=N,
+            mx_scale_stride_k=mx_scale_stride_k,
+            mx_scale_stride_n=mx_scale_stride_n,
+            HAS_GATHER=gather_indx is not None,
+        )
+        w_tensor, w_tma_transpose = w_tensor_and_transpose
+        mx_tensor, mx_tma_transpose = mx_tensor_and_tranpose
+    else:
+        x_tensor = x
+        w_tensor, w_tma_transpose = w, False
+        mx_tensor, mx_tma_transpose = mx_ctx.weight_scale, False
     if isinstance(x_tensor, torch.Tensor):
         x_tensor = flex.lhs_data.reinterpret(x)
     if isinstance(w_tensor, torch.Tensor):
         w_tensor = flex.rhs_data.reinterpret(w)
     (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(n_cta,)](
         flex.out_data.reinterpret(memory["output"]),
-        flex.out_data.reinterpret(out0), *out0.stride(),
-        *out0_flex,
+        flex.out_data.reinterpret(out0), *out0.stride(), *out0_flex,
         x_tensor, x.stride(0), x.stride(1), x.stride(2),
         flex.lhs_data.scale,
-        w_tensor, w.stride(0), w.stride(1), w.stride(2), w.stride(2) != 1,
+        w_tensor, w.stride(0), w.stride(1), w.stride(2), w_tma_transpose,
         flex.rhs_data.scale,
-        mx_tensor, mx_scale_stride_e, mx_scale_stride_k, mx_scale_stride_n, mx_scale_stride_n != 1,
+        mx_tensor, mx_scale_stride_e, mx_scale_stride_k, mx_scale_stride_n, mx_tma_transpose,
         bias, bias_stride,
-        x.shape[1],
         x.shape[1] if routing_data.expt_hist is None else None,
         N, K,
         betas, gammas,
```

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -39,7 +39,7 @@ def _matmul_ogs(
     WScale,
     MxScale, stride_mx_e, stride_mx_k, stride_mx_n, MX_TRANSPOSE: tl.constexpr,
     B, stride_b_e, # Bias
-    NRows, M, N, K, # shapes
+    M, N, K, # shapes
     # expt data
     Betas, Gammas,
     GatherIndx,
```
