Commit 4327b5b

[TRITON_KERNELS] pad tensors in HopperValue layout (#8677)
1 parent 240a5c8 commit 4327b5b

5 files changed, +18 -7 lines changed


python/triton_kernels/tests/test_matmul.py

Lines changed: 0 additions & 3 deletions
@@ -376,9 +376,6 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, has_y_gamm
     if torch.cuda.get_device_capability()[0] < 10:
         if "mxfloat4" not in weight_dtype_str:
             pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
-        if k % 64 != 0 or n % 64 != 0:
-            # Automatic padding not implemented for Hopper swizzle
-            pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")
 
     expt_is_inner = (inner_expt_opt is not None)
     if expt_is_inner:

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 3 additions & 0 deletions
@@ -73,6 +73,7 @@ def _load_tile_attrs(
     SPLIT_K: tl.constexpr,
     GROUP_M: tl.constexpr,
     XCD_SWIZZLE: tl.constexpr,
+    SWIZZLE_MX_VALUE: tl.constexpr,
 ):
     # unpack and swizzle program ids
     pid_emnk = tile_id
@@ -116,6 +117,8 @@ def _load_tile_attrs(
         K_W = K * (PACKED_BLOCK_K_W // BLOCK_K)
     else:
         K_W = K // (BLOCK_K // PACKED_BLOCK_K_W)
+    if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
+        K_W = tl.cdiv(K_W, 128) * 128
     k_tiles = tl.cdiv(K - off_k_x, BLOCK_K * SPLIT_K)
     if ExptData is None:
         tl.static_assert(M is not None)
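
The added clause rounds the packed weight extent K_W up to the next 128-wide boundary via ceiling division. A minimal host-side sketch of the same round-up arithmetic (plain Python; the cdiv helper here is a stand-in for tl.cdiv, not part of the kernel code):

def cdiv(a, b):
    # ceiling division: smallest q such that q * b >= a
    return (a + b - 1) // b

def round_up(x, multiple):
    # mirrors K_W = tl.cdiv(K_W, 128) * 128
    return cdiv(x, multiple) * multiple

assert round_up(96, 128) == 128   # a partial tile is rounded up
assert round_up(256, 128) == 256  # exact multiples are unchanged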

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 5 additions & 2 deletions
@@ -222,7 +222,7 @@ def _matmul_ogs(
         M, K, ExptData, ExptHist, ExptOffs, ExptTileOffs,
         EXPT_IS_INNER, X_IS_PADDED, W_IS_PADDED,
         BLOCK_M, BLOCK_K, PACKED_BLOCK_K_W, SPLIT_K,
-        GROUP_M, XCD_SWIZZLE)
+        GROUP_M, XCD_SWIZZLE, SWIZZLE_MX_VALUE)
 
     # For split-k, advance to the output k slice
     if SPLIT_K > 1:
@@ -290,7 +290,10 @@ def _matmul_ogs(
 
     # B pointers
     offs_w_n = pid_n * PACKED_BLOCK_N_W + tl.arange(0, PACKED_BLOCK_N_W)
-    offs_w_n = tl.max_contiguous(tl.multiple_of(offs_w_n % (N // W_N_DIVISOR), PACKED_BLOCK_N_W), PACKED_BLOCK_N_W)
+    N_W = N
+    if SWIZZLE_MX_VALUE == "HOPPER_VALUE":
+        N_W = tl.cdiv(N_W, 64) * 64
+    offs_w_n = tl.max_contiguous(tl.multiple_of(offs_w_n % (N_W // W_N_DIVISOR), PACKED_BLOCK_N_W), PACKED_BLOCK_N_W)
 
     if is_x_microscaled:
         XMxScale += start_z.to(index_type) * stride_x_mx_z
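
With HOPPER_VALUE swizzling the weight tensor is stored with its N extent padded to a multiple of 64, so the wrap-around modulo on offs_w_n must use the padded width N_W rather than the logical N. A rough standalone illustration (plain Python; W_N_DIVISOR is taken as 1 purely for simplicity):

PACKED_BLOCK_N_W = 64
N = 80                            # logical number of weight columns
N_W = ((N + 63) // 64) * 64       # padded width under HOPPER_VALUE -> 128
pid_n = 1
offs_w_n = [pid_n * PACKED_BLOCK_N_W + i for i in range(PACKED_BLOCK_N_W)]
# Wrapping at the logical N would alias columns 80..127 back onto 0..47;
# wrapping at the padded N_W keeps them inside the padded weight buffer.
assert [o % N_W for o in offs_w_n] == offs_w_n
assert [o % N for o in offs_w_n] != offs_w_n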

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 2 additions & 2 deletions
@@ -217,7 +217,7 @@ def _p_matmul_ogs(
         M, K, ExptData, ExptHist, ExptOffs, ExptTileOffs,
         EXPT_IS_INNER, X_IS_PADDED, W_IS_PADDED,
         BLOCK_M, BLOCK_K, PACKED_BLOCK_K_W, SPLIT_K,
-        GROUP_M, XCD_SWIZZLE)
+        GROUP_M, XCD_SWIZZLE, SWIZZLE_MX_VALUE)
     off_n = BLOCK_N * pid_n
 
     # Base pointers and offsets.
@@ -347,7 +347,7 @@ def _p_matmul_ogs(
             M, K, ExptData, ExptHist, ExptOffs, ExptTileOffs,
             EXPT_IS_INNER, X_IS_PADDED, W_IS_PADDED,
             BLOCK_M, BLOCK_K, PACKED_BLOCK_K_W, SPLIT_K,
-            GROUP_M, XCD_SWIZZLE)
+            GROUP_M, XCD_SWIZZLE, SWIZZLE_MX_VALUE)
         off_n1 = pid_n1 * BLOCK_N
     else:
         tile_id1, expt_id1, start_z1, start_m1, eM1 = tile_id, expt_id, start_z_out, start_m, eM

python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py

Lines changed: 8 additions & 0 deletions
@@ -120,6 +120,14 @@ def swizzle_data(self, data):
         batch = data.ndim - 2
         assert batch >= 0
         assert self.mma_version in (2, 3)
+        # Pre-pad both matrix dims to multiples of 64
+        *_, M_in, K_in = data.shape
+        SWIZZLE_ALIGN_M = 64
+        SWIZZLE_ALIGN_K = 64
+        pad_m = (SWIZZLE_ALIGN_M - (M_in % SWIZZLE_ALIGN_M)) % SWIZZLE_ALIGN_M
+        pad_k = (SWIZZLE_ALIGN_K - (K_in % SWIZZLE_ALIGN_K)) % SWIZZLE_ALIGN_K
+        data = torch.nn.functional.pad(data, (0, pad_k, 0, pad_m))
+
         data = self._maybe_mT(data)
         init_shape = data.shape
 
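
The new code pads the trailing two dimensions up to multiples of 64 before swizzling; (ALIGN - (x % ALIGN)) % ALIGN evaluates to 0 when x is already aligned. A minimal standalone sketch of the same step (assumes PyTorch; the helper name and shapes are illustrative, not taken from the library):

import torch
import torch.nn.functional as F

def pad_to_multiple_of_64(data: torch.Tensor) -> torch.Tensor:
    *_, m, k = data.shape
    pad_m = (64 - (m % 64)) % 64   # rows to append (0 if already aligned)
    pad_k = (64 - (k % 64)) % 64   # columns to append
    # F.pad pads the last dim first: (left_k, right_k, left_m, right_m)
    return F.pad(data, (0, pad_k, 0, pad_m))

x = torch.zeros(2, 100, 70)        # batch of 2 matrices of shape 100x70
assert pad_to_multiple_of_64(x).shape == (2, 128, 128)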
