
Commit 7578e3e

[mxfp] support EXPT_IS_INNER for MX (#8385)
# New contributor declaration
- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 11af53c commit 7578e3e

File tree

9 files changed (+107 lines, -46 lines)


python/triton_kernels/tests/test_matmul.py

Lines changed: 48 additions & 10 deletions
@@ -246,7 +246,8 @@ class Case:
         Case(300, 400, 400, "ragged", "bfloat16", "mxfloat8_e4m3fn", 8, 4, hbm_swizzling=True),
         Case(300, 400, 400, "batched", "bfloat16", "mxfloat8_e5m2", 32, 4),
         Case(1000, 700, 2, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2),
-        Case(1, 2880, 2880, "ragged", "bfloat16", "mxfloat4_e2m1", 128, 4),
+        # Cover (N or K) % 128 == 64 (https://github.com/triton-lang/triton/pull/7203)
+        Case(1, 1472, 1472, "ragged", "bfloat16", "mxfloat4_e2m1", 128, 4),
         Case(16, 256, 256, "ragged", "float8_e5m2", "mxfloat4_e2m1", 128, 4, hbm_swizzling=True),
         Case(1000, 704, 832, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
         Case(1000, 704, 832, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
@@ -318,6 +319,24 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
             n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
             x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope):
+    # We catch and re-invoke pytest.skip(), because otherwise pytest may hold a reference to
+    # the frame that called pytest.skip, including all the tensors, leading to OOM.
+    skip_message = None
+    try:
+        _test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
+                 n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
+                 x_transpose, w_transpose, y_transpose,
+                 device, opt_flags_scope)
+    except pytest.skip.Exception as e:
+        skip_message = str(e)
+
+    if skip_message is not None:
+        pytest.skip(skip_message)
+
+
+def _test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
+             n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
+             x_transpose, w_transpose, y_transpose,
+             device, opt_flags_scope):
     # TODO: remove when Triton FP8 supports proper RTNE
     if is_cuda():
         if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
@@ -327,8 +346,6 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
         if weight_dtype_str.startswith("mx"):
             if "float8" in act_dtype_str and torch.cuda.get_device_capability()[0] < 10:
                 pytest.skip("float8 x mx not supported with cuda capability < 10")
-            if n == 2880 and k == 2880 and torch.cuda.get_device_capability()[0] < 9:
-                pytest.skip("Not enough memory on A100")

    elif is_hip():
        if "float8" in act_dtype_str and "mx" in weight_dtype_str and not is_hip_cdna4():
@@ -366,8 +383,21 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
        pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")

    expt_is_inner = (inner_expt_opt is not None)
-    if expt_is_inner and (mode != "ragged" or "mx" in act_dtype_str or "mx" in weight_dtype_str):
-        pytest.skip("Not supported yet")
+    if expt_is_inner:
+        if mode != "ragged":
+            pytest.skip("inner_expt_opt only meaningful with ragged")
+        if "mx" in act_dtype_str and inner_expt_opt != "pad_x":
+            pytest.skip("inner_expt_opt and act mx only supported with pad_x")
+        if "mx" in weight_dtype_str:
+            if inner_expt_opt != "pad_w":
+                pytest.skip("inner_expt_opt and weight mx only supported with pad_w")
+            if is_persistent and not hbm_swizzling:
+                pytest.skip("FIXME: Fatal Python error: Aborted")
+            if is_hip():
+                if act_dtype_str == "bfloat16":
+                    pytest.skip("FIXME: failed to translate module to LLVM IR")
+                if hbm_swizzling:
+                    pytest.skip("NYI: nner_expt_opt and HBM swizzling")

    # launch metadata for batched / mx types may not work yet.
    torch.manual_seed(0)
@@ -399,6 +429,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
    opt_flags.update_opt_flags_constraints(constraints)

    weight_mxfp = weight_dtype_str.startswith("mx")
+    weight_mxfp4 = weight_mxfp and "float4" in weight_dtype_str
    if weight_mxfp:
        weight_dtype_str = weight_dtype_str[2:]
    act_mxfp8 = act_dtype_str.startswith("mx")
@@ -422,6 +453,13 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
        rdata = gindx = sindx = None

    padding_block_k = 32
+    if hbm_swizzling:
+        if torch.cuda.get_device_capability()[0] >= 10:
+            # Blackwell scale swizzling constraint
+            # https://github.com/triton-lang/triton/blob/814b862166c756d9f33238844f4ac047e0243388/python/triton_kernels/triton_kernels/tensor_details/layout_details/blackwell_scale.py#L45
+            padding_block_k = 128
+        elif not is_persistent:
+            padding_block_k = 64
    x_tri, w_tri, bias_tri, gs0_tri, gs1_tri = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act,
                                                                 mode, torch.bfloat16 if act_mxfp8 else act_dtype, #
                                                                 torch.bfloat16 if weight_mxfp else weight_dtype,
@@ -457,7 +495,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
    # compute layouts
    w_layout, w_layout_opts = layout.StridedLayout, dict()
    w_scale_layout, w_scale_layout_opts = layout.StridedLayout, dict()
-    if hbm_swizzling and "float4" in weight_dtype_str:
+    if hbm_swizzling and weight_mxfp4:
        w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=mx_axis)
        w_scale_layout, w_scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
            mx_axis=mx_axis, num_warps=8)
@@ -466,7 +504,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
        if colmajor_mxfp_weight:
            w_tri, w_scale_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=mx_axis)
            w_ref = upcast_from_mxfp(w_tri, w_scale_tri, torch.bfloat16, axis=mx_axis)
-            w_tri_dtype = FP4 if "float4" in weight_dtype_str else weight_dtype
+            w_tri_dtype = FP4 if weight_mxfp4 else weight_dtype
            w_tri = wrap_torch_tensor(w_tri, w_tri_dtype)
            w_scale_tri = wrap_torch_tensor(w_scale_tri)
            # convert layouts
@@ -568,8 +606,8 @@ def _pad_and_block(x: torch.Tensor) -> torch.Tensor:
        tri_y = matmul_ogs(x_tri, w_tri, bias_tri, rdata, gindx, sindx, precision_opt,
                           gammas=gs1_ref, epilogue=epilogue, y=y_tri_in,
                           inner_routing_data=inner_routing_data)
-    except (opt_flags.InapplicableConstraint, NotImplementedError):
-        pytest.skip("inapplicable opt_flags constraint")
+    except (opt_flags.InapplicableConstraint, NotImplementedError) as e:
+        pytest.skip(f"inapplicable opt_flags constraint {e}")
    if y_tri_in is not None:
        assert tri_y.data_ptr() == y_tri_in.data_ptr()
        assert tri_y.shape == y_tri_in.shape
@@ -602,7 +640,7 @@ def scale(val, scal):
        ref_y = upcast_from_mxfp_torch(ref_y_quant, ref_y_scale, target_dtype=ref_y.dtype, axis=-1)
        maxtol = 4e-1
        rmstol = 4e-2
-    elif weight_mxfp and "float4_e2m1" in weight_dtype_str:
+    elif weight_mxfp4:
        if act_is_float8:
            maxtol = 8e-2
        else:
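
To illustrate the skip-forwarding pattern that `test_op` now uses, here is a minimal self-contained sketch. It is not part of the commit; `_run_heavy_test`, `test_heavy`, and the parameter `n` are hypothetical stand-ins for `_test_op` and its arguments.

```python
import pytest
import torch


def _run_heavy_test(n):  # hypothetical stand-in for _test_op
    x = torch.empty(n, 1024)  # tensor kept alive by this frame
    if n >= 4096:
        pytest.skip("hypothetical: problem size too large for this device")
    assert x.shape == (n, 1024)


@pytest.mark.parametrize("n", [64, 4096])
def test_heavy(n):
    # Catch the skip raised inside the heavy frame and re-raise it from this
    # slim wrapper, so pytest does not hold a reference to the frame that
    # allocated the tensors (which could otherwise accumulate into an OOM).
    skip_message = None
    try:
        _run_heavy_test(n)
    except pytest.skip.Exception as e:
        skip_message = str(e)
    if skip_message is not None:
        pytest.skip(skip_message)
```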

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 10 additions & 3 deletions
@@ -646,9 +646,17 @@ def matmul_ogs(x, w, bias,
     w_has_tma = opt_flags.is_persistent
     w_tensor_or_tma = w_storage.make_tma([1, opt_flags.block_k, opt_flags.block_n], "dense") if w_has_tma else w_storage.data
     # create tma descriptor for w_scale
-    w_scale_tensor_or_tma = w_scale
     w_scale_has_tma = opt_flags.is_persistent and w_scale is not None
-    w_scale_tensor_or_tma = w_scale.storage.make_tma([opt_flags.block_n, opt_flags.block_k], "dense") if w_scale_has_tma else w_scale
+    w_transpose = w_storage.data.stride()[-2] == 1
+    if w_scale_has_tma:
+        w_scale_storage = w_scale.storage
+        w_scale_tma_block_size = [opt_flags.block_n, opt_flags.block_k] if w_transpose else [opt_flags.block_k, opt_flags.block_n]
+        if isinstance(w_scale.storage.layout, StridedLayout):
+            w_scale_storage = _canonicalize_storage(w_scale.storage, 3, None)
+            w_scale_tma_block_size = [1] + w_scale_tma_block_size
+        w_scale_tensor_or_tma = w_scale_storage.make_tma(w_scale_tma_block_size, "dense")
+    else:
+        w_scale_tensor_or_tma = w_scale
     # canonicalize strides
     x_strides = [0]*(3 - x_storage.data.ndim) + list(x_storage.data.stride())
     x_scale_strides = x_scale.stride() if x_has_mx else (None, None, None)
@@ -663,7 +671,6 @@ def matmul_ogs(x, w, bias,
     # (i.e. col-wise). Since this matters when w_has_mx is True and w_transpose
     # is True the fast code path, stride(-2) == 1 takes precedence, e.g., vs.
     # w_transpose = w_storage.data.stride()[-1] != 1
-    w_transpose = w_storage.data.stride()[-2] == 1
     fused_comm_kwargs = {
         "pYPtrs": fused_comm.out_handles,
         "ScatterShardIndx": fused_comm.scatter_shard_indx,

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 4 additions & 3 deletions
@@ -98,13 +98,14 @@ def _load_tile_attrs(
         tl.static_assert(M is not None)
         expt_id, pid_z, pid_z_out, start_m, block_id, eM = 0, 0, pid_e, 0, pid_m, M
         k_tiles = tl.cdiv(tl.load(ExptHist + pid_e), BLOCK_K)
-        padded_start_off = tl.load(ExptTileOffs + pid_e) * BLOCK_K
+        padded_start_off_raw = tl.load(ExptTileOffs + pid_e)
+        padded_start_off = padded_start_off_raw * BLOCK_K
         unpadded_start_off = tl.load(ExptOffs + pid_e)
         off_k_x = padded_start_off if X_IS_PADDED else unpadded_start_off
         # K_W is only used for non-TMA kernel (W bound is handled by TMA on TMA kernel).
         if W_IS_PADDED:
-            off_k_w = padded_start_off
-            K_W = tl.load(ExptTileOffs + pid_e + 1) * BLOCK_K
+            off_k_w = padded_start_off_raw * PACKED_BLOCK_K_W
+            K_W = tl.load(ExptTileOffs + pid_e + 1) * PACKED_BLOCK_K_W
         else:
             off_k_w = unpadded_start_off
             K_W = tl.load(ExptOffs + pid_e + 1)
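
A small numeric sketch of the offset change in `_load_tile_attrs`: expert tile offsets are stored in units of K-tiles, so the activation offset scales by `BLOCK_K` (unpacked elements) while the weight offset now scales by the packed block size (for mxfp4, two values per byte). The names mirror the kernel constants, but the list and values below are illustrative only.

```python
BLOCK_K = 128
W_PACK_DIVISOR = 2                      # 2 fp4 values per uint8
PACKED_BLOCK_K_W = BLOCK_K // W_PACK_DIVISOR

expt_tile_offs = [0, 3, 7]              # hypothetical ExptTileOffs (in tiles)
pid_e = 1                               # second expert

padded_start_off_raw = expt_tile_offs[pid_e]          # 3 tiles
off_k_x = padded_start_off_raw * BLOCK_K              # 384 activation elements
off_k_w = padded_start_off_raw * PACKED_BLOCK_K_W     # 192 packed weight bytes
K_W = expt_tile_offs[pid_e + 1] * PACKED_BLOCK_K_W    # 448: end bound, packed

print(off_k_x, off_k_w, K_W)            # 384 192 448
```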

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 4 additions & 4 deletions
@@ -131,7 +131,7 @@ def _matmul_ogs(
         tl.static_assert(w_type == tl.uint8 or (w_type == tl.float8e4nv or w_type == tl.float8e5),
                          "mx_weight_ptr must be uint8 or fp8")
         tl.static_assert(WMxScale.dtype.element_ty == tl.uint8, "mx_scale_ptr must be uint8")
-        tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR")
+        tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, f"{BLOCK_K=} must be a multiple of {MX_PACK_DIVISOR=}")
         tl.static_assert(SWIZZLE_MX_VALUE == "HOPPER_VALUE" or SWIZZLE_MX_VALUE is None, "Only Hopper swizzling is supported for values")

         # TODO: refactor if/else when triton front end improves
@@ -247,7 +247,6 @@ def _matmul_ogs(

     # TODO: refactor if/else when triton front end improves
     if is_w_microscaled:
-        tl.static_assert(not EXPT_IS_INNER, "Not supported yet")
         WMxScale += expt_id * stride_w_mx_e

         if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE":
@@ -281,7 +280,8 @@ def _matmul_ogs(
             offs_n_scale = (pid_n * SCALE_BLOCK_N + tl.arange(0, SCALE_BLOCK_N)) % N
             offs_n_scale = tl.max_contiguous(tl.multiple_of(offs_n_scale, SCALE_BLOCK_N), SCALE_BLOCK_N)
             # K dimension must be the last dimension for the scales
-            offs_k_scale = PACKED_MX_BLOCK * pid_k + tl.arange(0, PACKED_MX_BLOCK)
+            tl.static_assert(not EXPT_IS_INNER or W_IS_PADDED)
+            offs_k_scale = off_k_w // PACKED_BLOCK_K_W * PACKED_MX_BLOCK + tl.arange(0, PACKED_MX_BLOCK)
             WMxScalePtrs = WMxScale + offs_k_scale.to(index_type)[None, :] * stride_scale_k + offs_n_scale.to(index_type)[:, None] * stride_w_mx_n
     else:
         WMxScalePtrs = None
@@ -295,7 +295,7 @@ def _matmul_ogs(
         XMxScale += start_z.to(index_type) * stride_x_mx_z
         if GatherIndx is None:
             XMxScale += start_m * stride_x_mx_m
-        offs_x_k_scale = MX_SCALE_BLOCK_K * pid_k + tl.arange(0, MX_SCALE_BLOCK_K)
+        offs_x_k_scale = off_k_x // MXFP_BLOCK_SIZE + tl.arange(0, MX_SCALE_BLOCK_K)
         XMxScalePtrs = XMxScale + offs_x_m.to(index_type)[:, None] * stride_x_mx_m + offs_x_k_scale.to(index_type)[None, :] * stride_x_mx_k
     else:
         XMxScalePtrs = None
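
The scale-offset change can be checked with plain arithmetic: the scale column index is now derived from the expert's (possibly padded) K start rather than from `pid_k`, which is what makes `EXPT_IS_INNER` work for MX inputs. A small sketch with illustrative values; the constants mirror the kernel names but are not imported from it.

```python
import numpy as np

MXFP_BLOCK_SIZE = 32                           # one scale per 32 K elements
BLOCK_K = 128
MX_SCALE_BLOCK_K = BLOCK_K // MXFP_BLOCK_SIZE  # 4 scale columns per K tile

off_k_x = 384  # this expert's padded K start, in elements (illustrative)
offs_x_k_scale = off_k_x // MXFP_BLOCK_SIZE + np.arange(MX_SCALE_BLOCK_K)
print(offs_x_k_scale)  # [12 13 14 15]: the scales covering K in [384, 512)
```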

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 1 addition & 2 deletions
@@ -125,7 +125,6 @@ def _p_matmul_ogs(
         tl.static_assert(get_dtype(WMxScale) == tl.uint8, "mx_scale_ptr must be uint8")
         tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR")
         tl.static_assert(SWIZZLE_MX_SCALE == "BLACKWELL_SCALE" or SWIZZLE_MX_SCALE is None, "Only Blackwell swizzling is supported for scales")
-        tl.static_assert(not EXPT_IS_INNER, "Not supported yet")

         # We have pack 2 fp4 values in a byte
         W_PACK_DIVISOR: tl.constexpr = 2 if w_type == tl.uint8 else 1
@@ -249,7 +248,7 @@ def _p_matmul_ogs(
         XMxScalePtrs = XMxScale + start_z.to(index_type) * stride_x_mx_z
         if GatherIndx is None:
             XMxScalePtrs += start_m * stride_x_mx_m
-        offs_k_scale = MX_SCALE_BLOCK_K * pid_k + tl.arange(0, MX_SCALE_BLOCK_K)
+        offs_k_scale = off_k_x0 // MXFP_BLOCK_SIZE + tl.arange(0, MX_SCALE_BLOCK_K)
         XMxScalePtrs += (offs_x_m if USE_GATHER_TMA else offs_m).to(index_type)[:, None] * stride_x_mx_m
         XMxScalePtrs += offs_k_scale.to(index_type)[None, :] * stride_x_mx_k
     else:

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 3 additions & 4 deletions
@@ -229,14 +229,13 @@ def make_default_opt_flags_nvidia(
         is_persistent = False
     block_n = block_n_tma if is_persistent else block_n
     # block k
-    if constraints.get("block_k", None) is not None:
-        block_k = constraints["block_k"]
-    else:
-        block_k = opt_flags_nvidia.compute_block_k(m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config, has_y_acc_in)
+    block_k = opt_flags_nvidia.compute_block_k(m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config, has_y_acc_in)
     if block_n == 256 and block_k == 128 and block_m <= 64 and is_persistent and rhs_dtype == FP4 and k >= 4096 and tokens_per_expt > 1:
         # Swap block_n and block_k for mxfp4 weights so that block_k is a full cacheline, so long as K is sufficiently large.
         # TODO: swizzle the HBM layout of the weights instead
         block_n, block_k = block_k, block_n
+    if constraints.get("block_k", None) is not None:
+        block_k = constraints["block_k"]
     # split_k
     if constraints.get("max_allowable_mn", 0) > 0 and constraints.get("split_k") is not None:
         split_k = max_allowable_mn(constraints["max_allowable_mn"], m, n, constraints.get("split_k"))
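
The reordering above means an explicit `block_k` constraint is applied after the mxfp4 block_n/block_k swap, so the swap can no longer overwrite it. A minimal sketch of that ordering, assuming hypothetical stand-ins `compute_default_block_k` and `should_swap_for_mxfp4` for `opt_flags_nvidia.compute_block_k` and the full swap condition.

```python
def pick_block_k(constraints: dict, block_n: int,
                 compute_default_block_k, should_swap_for_mxfp4):
    block_k = compute_default_block_k()
    if should_swap_for_mxfp4(block_n, block_k):
        # swap so block_k covers a full cacheline of packed fp4 weights
        block_n, block_k = block_k, block_n
    if constraints.get("block_k") is not None:
        block_k = constraints["block_k"]   # explicit constraint wins, applied last
    return block_n, block_k


print(pick_block_k({"block_k": 64}, 256,
                   lambda: 128, lambda n, k: n == 256 and k == 128))
# -> (128, 64): the swap happened, but the constraint still decides block_k
```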

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 7 additions & 4 deletions
@@ -1,9 +1,9 @@
 import torch
 import triton
 from triton_kernels import target_info
-from triton_kernels.tensor import get_layout, bitwidth, FP4
-from triton_kernels.tensor_details.layout import HopperAmpereMXScaleLayout
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
+from triton_kernels.tensor import FP4, bitwidth, get_layout
+from triton_kernels.tensor_details.layout import HopperAmpereMXScaleLayout


 def compute_grid_size(routing_data, batch_size, m, n, block_m, block_n):
@@ -18,8 +18,11 @@ def compute_grid_size(routing_data, batch_size, m, n, block_m, block_n):
 def compute_block_n(n: int, arch, precision_config):
     # block_n:
     layout = get_layout(precision_config.weight_scale)
-    if isinstance(layout, HopperAmpereMXScaleLayout) and layout.num_warps == 4:
-        return 128, 128
+    if isinstance(layout, HopperAmpereMXScaleLayout):
+        if layout.num_warps in [4, 8]:
+            # https://github.com/triton-lang/triton/blob/814b862166c756d9f33238844f4ac047e0243388/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py#L265
+            block_n = 2 * layout.num_warps * 2 * 8
+            return block_n, block_n
     elif precision_config.max_num_imprecise_acc is None and n > 128:
         return 256, 256
     else:
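
The hard-coded `128, 128` is replaced by a formula tied to `layout.num_warps`. A quick arithmetic check of the two supported warp counts (no Triton imports required):

```python
# Mirrors the new expression in compute_block_n for HopperAmpereMXScaleLayout.
for num_warps in (4, 8):
    block_n = 2 * num_warps * 2 * 8
    print(num_warps, block_n)  # 4 -> 128 (the previous constant), 8 -> 256
```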

python/triton_kernels/triton_kernels/tensor.py

Lines changed: 15 additions & 11 deletions
@@ -2,12 +2,13 @@
 from typing import Type

 import torch
-from triton.tools.tensor_descriptor import TensorDescriptor
 from triton.tools.ragged_tma import create_ragged_descriptor
+from triton.tools.tensor_descriptor import TensorDescriptor
+
 from .target_info import cuda_capability_geq
-from .tensor_details.layout import Layout, StridedLayout
-from .tensor_details import ragged_tensor as ragged_tensor_details
 from .tensor_details import bitmatrix as bitmatrix_details
+from .tensor_details import ragged_tensor as ragged_tensor_details
+from .tensor_details.layout import BlackwellMXValueLayout, Layout, StridedLayout
 from .tensor_details.ragged_tensor import RaggedTensorMetadata


@@ -46,26 +47,28 @@ def is_tma_compliant(self):
         compliant = [strides[i] * bitwidth % 128 == 0 for i in range(ndim) if i != major_dim]
         return all(compliant)

-    def make_dense_tma(self, block_shape, transpose=False):
+    def make_dense_tma(self, block_shape):
         strides = list(self.data.stride())
         shape = list(self.data.shape)
-        transpose = self.data.stride()[-1] != 1
+        transpose = strides[-1] != 1
         if transpose:
             block_shape = block_shape[:-2] + [block_shape[-1], block_shape[-2]]
             shape = shape[:-2] + [shape[-1], shape[-2]]
             strides = strides[:-2] + [strides[-1], strides[-2]]
-        if self.data.dtype == torch.uint8 and self.layout.name == "BLACKWELL_VALUE":
+        if self.data.dtype == torch.uint8 and (self.layout.name is None or "_SCALE" not in self.layout.name):
             indx = strides.index(1)
             block_shape[indx] = block_shape[indx] // 2
-            if shape[-1] % 128 != 0:
-                raise ValueError("inner shape need to be multiple of 128 for "
-                                 "mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs.")
+            if isinstance(self.layout, BlackwellMXValueLayout):
+                if shape[-1] % 128 != 0:
+                    raise ValueError(
+                        "inner shape need to be multiple of 128 for mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs."
+                    )
         block_shape = self.layout.swizzle_block_shape(block_shape)
         return TensorDescriptor(self.data, shape, strides, block_shape)

-    def make_tma(self, block_shape, mode, transpose=False):
+    def make_tma(self, block_shape, mode):
         if mode in ["dense", "gather", "scatter"]:
-            return self.make_dense_tma(block_shape, transpose)
+            return self.make_dense_tma(block_shape)
         assert mode == "ragged"
         ragged_dim = len(self.data.shape) - 2
         return create_ragged_descriptor(self.data, block_shape, ragged_dim=ragged_dim)
@@ -195,6 +198,7 @@ class RaggedTensor:
     A ragged `tensor` is a collection of 2D tensors that share the same number of columns.
     Each tensor in this collection is called a `slice`.
     """
+
     # slice_sizes[i] is the number of rows in slice `i`
     slice_sizes: torch.Tensor
     # ragged tensors are stored in memory as (potentially padded) 2D tensors of shape
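
A pure-Python sketch of the block-shape handling now in `make_dense_tma` (no descriptor is built here): transpose is inferred from the strides, and packed uint8 values halve the block size along the contiguous axis. `dense_tma_block_shape` and the `layout_is_scale` flag are hypothetical simplifications of the layout-name check.

```python
import torch

def dense_tma_block_shape(data: torch.Tensor, block_shape: list[int],
                          layout_is_scale: bool) -> list[int]:
    strides = list(data.stride())
    block_shape = list(block_shape)
    if strides[-1] != 1:                       # transposed: swap the last two dims
        block_shape[-2], block_shape[-1] = block_shape[-1], block_shape[-2]
        strides[-2], strides[-1] = strides[-1], strides[-2]
    if data.dtype == torch.uint8 and not layout_is_scale:
        indx = strides.index(1)                # contiguous axis packs 2 fp4 per byte
        block_shape[indx] //= 2
    return block_shape


w = torch.zeros(256, 128, dtype=torch.uint8).t()   # column-major packed fp4 values
print(dense_tma_block_shape(w, [128, 64], layout_is_scale=False))
# -> [64, 64]: dims swapped for the transpose, contiguous dim halved for packing
```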
