Commit f81125b

[mxfp] handle w_scale w/o swizzle correctly (#8652)
In practice, we can't support w_scale with column-wise strided layout, since we will divide the reduction dim by 32 then it needs to be a multiple of 16 for TMA. So, we disable TMA (and persistent kernel) for this case. Added a test case for this. Before this PR the test case led to ``` E triton.compiler.errors.CompilationError: at 227:26: E w_scales = w_scales.reshape((w_scales.shape[1], w_scales.shape[2] * w_scales.shape[-2] * w_scales.shape[-1])) E w_scales = unswizzle_mx_scale_bw(w_scales) E else: E w_scales = WMxScale.load([expt_id, off_k_mx, off_n]) E w_scales = tl.reshape(w_scales, *w_scales.shape[1:]).T E E # --- update accumulator --- E if is_w_microscaled: E if SWAP_XW: E acc = tl.dot_scaled(w.T, w_scales, w_format, x.T, x_scales, x_format, acc=acc, fast_math=True) E else: E acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, w_format, acc=acc, fast_math=True) E ^ E rhs_scale must be a tensor of shape [256, 4]. Got ['4', '256'] ``` The way ``make_dense_tma`` was checking if it was called for scale was also ambiguous. Previously, it assumed for ``StridedLayout`` it's not scale which is wrong. # New contributor declaration - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [x] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. - [x] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent e93fc76 commit f81125b

File tree

4 files changed: +29 -19 lines changed

python/triton_kernels/tests/test_matmul.py

Lines changed: 1 addition & 0 deletions
@@ -233,6 +233,7 @@ class Case:
     Case(16, 16, 1000, "batched", "float8_e5m2", "float8_e5m2", 5, 1, split_k=None),
     Case(16, 16, 2048, "batched", "float8_e5m2", "float8_e5m2", 6, 1, split_k=5),
     # mx types:
+    Case(1, 1024, 1024, "plain", "bfloat16", "mxfloat8_e4m3fn", 1, 1),
     Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1),
     Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True),
     Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True, epilogue_subtile=4),

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 17 additions & 6 deletions
@@ -416,6 +416,12 @@ def matmul_ogs(x, w, bias,
         # unaligned access.
         (inner_routing_data is None or w.stride(-1) == 1 or inner_routing_data.w_is_padded)
     )
+    if w_scale is not None and isinstance(w_scale.storage.layout, StridedLayout) and w_scale.storage.data.stride()[-1] != 1:
+        # In this case, we need to transpose w_scale, so the reduction dim
+        # becomes the last dim and is divided by 32. For the result to be a
+        # multiple of 16, as TMA requires, block_k would have to be a multiple
+        # of 512, which is too big.
+        can_use_tma = False
     has_gather_tma = has_gather and target_info.has_tma_gather()
     # hopper w/ mxfp4 doesn't support TMA
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
@@ -526,14 +532,23 @@ def matmul_ogs(x, w, bias,
     w_tensor_or_tma = w_storage.make_tma([1, opt_flags.block_k, opt_flags.block_n], "dense") if w_has_tma else w_storage.data
     # create tma descriptor for w_scale
     w_scale_has_tma = opt_flags.is_persistent and w_scale is not None
+    # When stride(-2) == stride(-1) == 1, it's ambiguous whether W is transposed
+    # (i.e. col-wise). Since this matters when w_has_mx is True and w_transpose
+    # is True the fast code path, stride(-2) == 1 takes precedence, e.g., vs.
+    # w_transpose = w_storage.data.stride()[-1] != 1
     w_transpose = w_storage.data.stride()[-2] == 1
     if w_scale_has_tma:
         w_scale_storage = w_scale.storage
-        w_scale_tma_block_size = [opt_flags.block_n, opt_flags.block_k] if w_transpose else [opt_flags.block_k, opt_flags.block_n]
+        scale_block_k = opt_flags.block_k // int(MXFP_BLOCK_SIZE)
+        # cancel out the transpose done inside make_tma since
+        # BlackwellMXScaleLayout.swizzle_block_shape expects block_shape[1] is
+        # the reduction dimension.
+        w_scale_tma_block_size = [opt_flags.block_n, scale_block_k] if w_transpose and w_scale.storage.layout.name == "BLACKWELL_SCALE" else [scale_block_k, opt_flags.block_n]
         if isinstance(w_scale.storage.layout, StridedLayout):
+            assert w_scale_storage.data.stride()[-1] == 1, "w_scale should be contiguous with StridedLayout"
             w_scale_storage = _canonicalize_storage(w_scale.storage, 3, None)
             w_scale_tma_block_size = [1] + w_scale_tma_block_size
-        w_scale_tensor_or_tma = w_scale_storage.make_tma(w_scale_tma_block_size, "dense")
+        w_scale_tensor_or_tma = w_scale_storage.make_tma(w_scale_tma_block_size, "dense", is_scale=True)
     else:
         w_scale_tensor_or_tma = w_scale
     # canonicalize strides
@@ -546,10 +561,6 @@ def matmul_ogs(x, w, bias,
     out_matmul_scale_strides = (0, ) * (4 - len(out_matmul_scale_strides)) + out_matmul_scale_strides
     # launch kernel
     kernels = specializations.get(epilogue=epilogue.specs, activation=matmul_fused_activation.specs)
-    # When stride(-2) == stride(-1) == 1, it's ambiguous whether W is transposed
-    # (i.e. col-wise). Since this matters when w_has_mx is True and w_transpose
-    # is True the fast code path, stride(-2) == 1 takes precedence, e.g., vs.
-    # w_transpose = w_storage.data.stride()[-1] != 1
     if gather_indx is not None:
         gather_src_indx = torch.div(gather_indx.src_indx, routing_data.n_expts_act, rounding_mode='trunc')
     fused_comm_kwargs = {
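As a reading aid for the hunks above, here is a hedged, standalone sketch of how the w_scale TMA block shape is now chosen. `opt_flags`, the layout-name check, and the helper name are simplified stand-ins for the real objects in `matmul_ogs.py`, not the library API.

```python
# Simplified stand-in for the block-shape selection in matmul_ogs.py (not the real API).
MXFP_BLOCK_SIZE = 32

def pick_w_scale_tma_block(block_k, block_n, w_transpose, layout_name, is_strided_layout):
    # The reduction dim of the scale tensor is block_k // 32, not block_k.
    scale_block_k = block_k // MXFP_BLOCK_SIZE
    if w_transpose and layout_name == "BLACKWELL_SCALE":
        # Pre-swap so that, after make_tma's internal transpose, swizzle_block_shape
        # still sees the reduction dim in block_shape[1].
        block = [block_n, scale_block_k]
    else:
        block = [scale_block_k, block_n]
    if is_strided_layout:
        block = [1] + block   # StridedLayout storage is canonicalized to 3 dims
    return block

print(pick_w_scale_tma_block(128, 256, True, "BLACKWELL_SCALE", False))  # [256, 4]
print(pick_w_scale_tma_block(128, 256, False, None, True))               # [1, 4, 256]
```

The `[256, 4]` shape in the first example matches the shape expected by `tl.dot_scaled` in the error trace from the PR description.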

python/triton_kernels/triton_kernels/tensor.py

Lines changed: 9 additions & 9 deletions
@@ -47,28 +47,28 @@ def is_tma_compliant(self):
         compliant = [strides[i] * bitwidth % 128 == 0 for i in range(ndim) if i != major_dim]
         return all(compliant)

-    def make_dense_tma(self, block_shape):
+    def make_dense_tma(self, block_shape, is_scale):
         strides = list(self.data.stride())
         shape = list(self.data.shape)
         transpose = strides[-1] != 1
         if transpose:
+            # Need to transpose since the tensor descriptor expects strides, except for the last dimension's, to be 16-byte aligned
+            # https://github.com/triton-lang/triton/blob/e5e0081db3335e7755e2c67c784cb1c92769812f/python/triton/tools/tensor_descriptor.py#L26
             block_shape = block_shape[:-2] + [block_shape[-1], block_shape[-2]]
             shape = shape[:-2] + [shape[-1], shape[-2]]
             strides = strides[:-2] + [strides[-1], strides[-2]]
-        if self.data.dtype == torch.uint8 and (self.layout.name is None or "_SCALE" not in self.layout.name):
+        if self.data.dtype == torch.uint8 and not is_scale:
             indx = strides.index(1)
             block_shape[indx] = block_shape[indx] // 2
-        if isinstance(self.layout, BlackwellMXValueLayout):
-            if shape[-1] % 128 != 0:
-                raise ValueError(
-                    "inner shape need to be multiple of 128 for mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs."
-                )
+        if isinstance(self.layout, BlackwellMXValueLayout) and shape[-1] % 128 != 0:
+            raise ValueError(
+                "inner shape need to be multiple of 128 for mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs.")
         block_shape = self.layout.swizzle_block_shape(block_shape)
         return TensorDescriptor(self.data, shape, strides, block_shape)

-    def make_tma(self, block_shape, mode):
+    def make_tma(self, block_shape, mode, is_scale=False):
         if mode in ["dense", "gather", "scatter"]:
-            return self.make_dense_tma(block_shape)
+            return self.make_dense_tma(block_shape, is_scale)
         assert mode == "ragged"
         ragged_dim = len(self.data.shape) - 2
         return create_ragged_descriptor(self.data, block_shape, ragged_dim=ragged_dim)
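To make the `is_scale` change above concrete, a small, hedged illustration (standalone; `layout_name` stands in for `self.layout.name`): uint8 storage is used both for packed fp4 values, whose contiguous block dim is halved, and for MX scales, which must not be halved. The old name-based heuristic misclassified scales held in a plain `StridedLayout`, whose layout name is None.

```python
import torch

# Old heuristic: infer "not a scale" from the layout name (None for StridedLayout).
def old_halves_block(dtype, layout_name):
    return dtype == torch.uint8 and (layout_name is None or "_SCALE" not in layout_name)

# New behavior: the caller states explicitly whether this tensor holds scales.
def new_halves_block(dtype, is_scale):
    return dtype == torch.uint8 and not is_scale

# A uint8 w_scale stored with a plain StridedLayout:
print(old_halves_block(torch.uint8, None))           # True  -> block dim wrongly halved
print(new_halves_block(torch.uint8, is_scale=True))  # False -> block dim left intact
```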

python/triton_kernels/triton_kernels/tensor_details/layout_details/blackwell_scale.py

Lines changed: 2 additions & 4 deletions
@@ -34,7 +34,7 @@ def swizzle_data(self, data):
         data = data.reshape(self.B, self.N_pad // self.ALIGN_N, self.ALIGN_N // 32, 32, self.K_pad // self.SWIZZLE_K,
                             self.SWIZZLE_K)
         data = data.transpose(2, 4).contiguous()
-        data = data.view(1, self.B * self.N_pad // 128, self.K_pad // 4, 2, 256)
+        data = data.view(1, self.B * self.N_pad // 128, self.K_pad // self.SWIZZLE_K, 2, 256)
         return data

     def unswizzle_data(self, data):
@@ -46,10 +46,8 @@ def unswizzle_data(self, data):
         return data[..., :self.K, :self.N]

     def swizzle_block_shape(self, block_shape):
-        MX_PACK_DIVISOR = 32
-        MX_SCALE_BLOCK_K = block_shape[1] // MX_PACK_DIVISOR
         assert block_shape[0] >= 128, f"{block_shape[0]=} must be >= 128"
-        return [1, block_shape[0] // 128, MX_SCALE_BLOCK_K // 4, 2, 256]
+        return [1, block_shape[0] // 128, block_shape[1] // 4, 2, 256]


     @triton.jit
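A brief, hedged before/after check of the `swizzle_block_shape` change (assumed block sizes only): the method no longer divides by the 32-element MX group itself; callers now pass `block_k // 32` in `block_shape[1]`, as done in `matmul_ogs.py` above, and both conventions produce the same swizzled shape.

```python
# Before: received [block_n, block_k] and divided by the MX group size internally.
def old_swizzle_block_shape(block_shape):
    MX_PACK_DIVISOR = 32
    mx_scale_block_k = block_shape[1] // MX_PACK_DIVISOR
    return [1, block_shape[0] // 128, mx_scale_block_k // 4, 2, 256]

# After: receives [block_n, block_k // 32], so only the // 4 split remains.
def new_swizzle_block_shape(block_shape):
    return [1, block_shape[0] // 128, block_shape[1] // 4, 2, 256]

block_n, block_k = 256, 512
assert old_swizzle_block_shape([block_n, block_k]) == new_swizzle_block_shape([block_n, block_k // 32])
print(new_swizzle_block_shape([block_n, block_k // 32]))  # [1, 2, 4, 2, 256]
```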
