
Commit 28533b1

[TRITON_KERNELS] Apply MXFP4 Hopper layout on A100 (#8474)
MXFP4 matmul performance is better with this layout on A100, so change the default layout. Also changed the layout names to reflect that they are used for both Hopper and Ampere.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because the change should be covered with existing tests.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 3200f5a commit 28533b1
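The functional change is the capability cutoff in `make_default_matmul_mxfp4_w_layout` and `make_default_matmul_mxfp4_w_scale_layout` (see the `layout.py` diff below): the check drops from `cuda_capability_geq(9)` to `cuda_capability_geq(8)`, so A100 (Ampere, sm_80) now gets the same swizzled MXFP4 layouts as Hopper. A minimal sketch, not part of this commit, of what the value-layout default returns afterwards; it assumes `triton_kernels` is installed and a CUDA GPU is present, and `mx_axis=1` is just an illustrative value:

```python
from triton_kernels.tensor_details.layout import make_default_matmul_mxfp4_w_layout

# Returns a (layout class, constructor kwargs) pair based on the GPU's compute capability.
layout_cls, layout_kwargs = make_default_matmul_mxfp4_w_layout(mx_axis=1)
print(layout_cls.__name__, layout_kwargs)
# Ampere (sm_80) and Hopper (sm_90): HopperAmpereMXValueLayout {'mx_axis': 1}
# Blackwell (sm_100+):               BlackwellMXValueLayout {}
# Anything older:                    StridedLayout {}
```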

5 files changed, +18 -19 lines


python/triton_kernels/tests/test_tensor_details/test_layout_hopper.py

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 import pytest
 from triton._internal_testing import is_cuda
 from triton_kernels.tensor import wrap_torch_tensor, convert_layout, FP4
-from triton_kernels.tensor_details.layout import HopperMXScaleLayout, HopperMXValueLayout
+from triton_kernels.tensor_details.layout import HopperAmpereMXScaleLayout, HopperAmpereMXValueLayout
 from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
 from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton
 from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper
@@ -25,7 +25,7 @@ def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
         x = x.mT
     if x.shape[1 - mx_axis] < 32:
         pytest.skip("Not enough elements along non-mx axis")
-    layout = HopperMXValueLayout(x.shape, mx_axis, mma_version)
+    layout = HopperAmpereMXValueLayout(x.shape, mx_axis, mma_version)
     res = layout.unswizzle_data(layout.swizzle_data(x))
     assert (res == x).all()

@@ -35,7 +35,7 @@ def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
 @pytest.mark.parametrize("shape", [(256, 64), (256, 128), (256, 256)])
 def test_mxfp4_scale_roundtrip(shape, mx_axis, num_warps):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
-    layout = HopperMXScaleLayout(x.shape, mx_axis=mx_axis, num_warps=num_warps)
+    layout = HopperAmpereMXScaleLayout(x.shape, mx_axis=mx_axis, num_warps=num_warps)
     res = layout.unswizzle_data(layout.swizzle_data(x))
     assert (res[:shape[0], :shape[1]] == x).all()

@@ -84,8 +84,8 @@ def test_upcast_mxfp4_to_bf16():
     x_bf16 = upcast_from_mxfp(x_fp4_val, x_fp4_scale, x.dtype, axis=mx_axis)
     x_fp4_val = wrap_torch_tensor(x_fp4_val, dtype=FP4)
     x_fp4_scale = wrap_torch_tensor(x_fp4_scale)
-    x_fp4_val = convert_layout(x_fp4_val, HopperMXValueLayout, mx_axis=mx_axis)
-    x_fp4_scale = convert_layout(x_fp4_scale, HopperMXScaleLayout, mx_axis=mx_axis, num_warps=num_warps)
+    x_fp4_val = convert_layout(x_fp4_val, HopperAmpereMXValueLayout, mx_axis=mx_axis)
+    x_fp4_scale = convert_layout(x_fp4_scale, HopperAmpereMXScaleLayout, mx_axis=mx_axis, num_warps=num_warps)
     y = torch.empty_like(x_bf16)
     _upcast_mxfp4_to_bf16[(1, )](
         y, x_fp4_val.storage.data, x_fp4_scale.storage.data, #
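The roundtrip contract exercised by these tests is unchanged by the rename. A minimal sketch of the same check outside pytest, assuming a CUDA device; the shape, `mx_axis`, and `mma_version` values are illustrative:

```python
import torch
from triton_kernels.tensor_details.layout import HopperAmpereMXValueLayout

x = torch.randint(0, 256, (256, 64), dtype=torch.uint8, device="cuda")
# swizzle_data and unswizzle_data must be exact inverses for the layout to be usable.
layout = HopperAmpereMXValueLayout(x.shape, mx_axis=1, mma_version=3)
assert (layout.unswizzle_data(layout.swizzle_data(x)) == x).all()
```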

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@
 import triton
 from triton_kernels import target_info
 from triton_kernels.tensor import get_layout, bitwidth, FP4
-from triton_kernels.tensor_details.layout import HopperMXScaleLayout
+from triton_kernels.tensor_details.layout import HopperAmpereMXScaleLayout
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE


@@ -18,7 +18,7 @@ def compute_grid_size(routing_data, batch_size, m, n, block_m, block_n):
 def compute_block_n(n: int, arch, precision_config):
     # block_n:
     layout = get_layout(precision_config.weight_scale)
-    if isinstance(layout, HopperMXScaleLayout) and layout.num_warps == 4:
+    if isinstance(layout, HopperAmpereMXScaleLayout) and layout.num_warps == 4:
         return 128, 128
     elif precision_config.max_num_imprecise_acc is None and n > 128:
         return 256, 256
@@ -60,7 +60,7 @@ def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:

 def compute_num_warps(block_m, block_n, is_persistent: bool, precision_config):
     layout = get_layout(precision_config.weight_scale)
-    if isinstance(layout, HopperMXScaleLayout):
+    if isinstance(layout, HopperAmpereMXScaleLayout):
         return layout.num_warps
     return max(block_m * block_n // 4096, 4 if is_persistent else 1)
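These heuristics read the warp count straight off the scale layout attached to the weight scales. A minimal illustration of the value they pick up, assuming a CUDA device; the shape, `mx_axis`, and `num_warps` are chosen arbitrarily:

```python
import torch
from triton_kernels.tensor_details.layout import HopperAmpereMXScaleLayout

scales = torch.randint(0, 256, (256, 64), dtype=torch.uint8, device="cuda")
layout = HopperAmpereMXScaleLayout(scales.shape, mx_axis=1, num_warps=4)
# compute_num_warps() returns this value verbatim when the weight scales carry
# this layout, and compute_block_n() selects 128 when it equals 4.
print(layout.num_warps)  # -> 4
```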

python/triton_kernels/triton_kernels/tensor_details/layout.py

Lines changed: 8 additions & 9 deletions
@@ -1,8 +1,8 @@
 from .layout_details.base import Layout
 from .layout_details.blackwell_scale import BlackwellMXScaleLayout
 from .layout_details.blackwell_value import BlackwellMXValueLayout
-from .layout_details.hopper_scale import HopperMXScaleLayout
-from .layout_details.hopper_value import HopperMXValueLayout
+from .layout_details.hopper_scale import HopperAmpereMXScaleLayout
+from .layout_details.hopper_value import HopperAmpereMXValueLayout
 from .layout_details.cdna4_scale import CDNA4MXScaleLayout
 from .layout_details.strided import StridedLayout
 from ..target_info import cuda_capability_geq, is_hip_cdna4
@@ -11,19 +11,18 @@
     "Layout",
     "BlackwellMXValueLayout",
     "BlackwellMXScaleLayout",
-    "HopperMXScaleLayout",
-    "HopperMXValueLayout",
+    "HopperAmpereMXScaleLayout",
+    "HopperAmpereMXValueLayout",
     "CDNA4MXScaleLayout",
     "StridedLayout",
 ]


 def make_default_matmul_mxfp4_w_layout(mx_axis: int):
     if cuda_capability_geq(10):
-        # return StridedLayout, dict()
         return BlackwellMXValueLayout, dict()
-    elif cuda_capability_geq(9):
-        return HopperMXValueLayout, {"mx_axis": mx_axis}
+    elif cuda_capability_geq(8):
+        return HopperAmpereMXValueLayout, {"mx_axis": mx_axis}
     else:
         return StridedLayout, dict()

@@ -34,7 +33,7 @@ def make_default_matmul_mxfp4_w_scale_layout(mx_axis: int, num_warps: int = 8):
     else:
         if cuda_capability_geq(10):
             return BlackwellMXScaleLayout, dict()
-        elif cuda_capability_geq(9):
-            return HopperMXScaleLayout, {"mx_axis": mx_axis, "num_warps": num_warps}
+        elif cuda_capability_geq(8):
+            return HopperAmpereMXScaleLayout, {"mx_axis": mx_axis, "num_warps": num_warps}

     return StridedLayout, dict()
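The scale-layout default follows the same capability cutoff as the value layout. A minimal sketch of the new dispatch, assuming `triton_kernels` is installed and a CUDA GPU is present; branches not shown in this hunk handle other targets, and `mx_axis`/`num_warps` are illustrative:

```python
from triton_kernels.tensor_details.layout import make_default_matmul_mxfp4_w_scale_layout

# Returns a (layout class, constructor kwargs) pair for the weight scales.
scale_cls, scale_kwargs = make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=8)
print(scale_cls.__name__, scale_kwargs)
# Ampere (sm_80) and Hopper (sm_90): HopperAmpereMXScaleLayout {'mx_axis': 1, 'num_warps': 8}
# Blackwell (sm_100+):               BlackwellMXScaleLayout {}
# Fallback:                          StridedLayout {}
```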

python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_scale.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from .base import Layout


-class HopperMXScaleLayout(Layout):
+class HopperAmpereMXScaleLayout(Layout):
     name: str = "HOPPER_SCALE"

     def __init__(self, shape, mx_axis, num_warps=8) -> None:

python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def _unpack_bits(x, mx_axis: int):
 # -----------------------------------------------------------------------


-class HopperMXValueLayout(Layout):
+class HopperAmpereMXValueLayout(Layout):
     name: str = "HOPPER_VALUE"

     def __init__(self, shape, mx_axis, mma_version=3):
