
Commit 08973b1

[BENCH] Implement value and scale swizzling for Hopper bf16xmxfp4 (#6833)
This improves utilisation from 28% to 37%. Edit: with triton-lang/triton#6812 we hit 49% utilisation. We don't try to optimise the upcasting sequence yet, but we could do that in a future PR if we are willing to add a global scale via a packing/unpacking sequence by Scott Gray. We also add tests for quantisation in general.
1 parent 971a52a commit 08973b1

File tree: 9 files changed, +923 −287 lines

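For context, the utilisation numbers quoted in the commit message are roofline ratios computed by the benchmark, and the bench_mlp.py diff below changes the compute term to use the peak TFLOPS matching the activation bitwidth rather than max(TFLOPS8, TFLOPS16). A minimal sketch of that calculation (not part of the commit), assuming a SPECS-style dictionary with the MAX_TFLOPS8, MAX_TFLOPS16 and MAX_TBPS entries that bench_mlp.py reads:

def roofline_util(time, flops, bytes_moved, bitwidth, specs):
    # Mirrors PerfData.util after this commit: pick the compute roofline
    # from the activation bitwidth instead of max(TFLOPS8, TFLOPS16).
    assert bitwidth in (8, 16)
    peak_flops = specs["MAX_TFLOPS8"] if bitwidth == 8 else specs["MAX_TFLOPS16"]
    min_t_flop = flops / peak_flops * 1e-3  # ns -> us, as in the original comment
    min_t_bw = bytes_moved / specs["MAX_TBPS"] * 1e-3
    return max(min_t_flop, min_t_bw) / time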

python/triton_kernels/bench/bench_mlp.py

Lines changed: 28 additions & 7 deletions
@@ -1,10 +1,12 @@
 from pathlib import Path
+from copy import deepcopy
 import matplotlib.pyplot as plt
 import json
 import triton.profiler as proton
 import torch
+import triton_kernels
 import triton_kernels.swiglu
-from triton_kernels.numerics_details.mxfp import downcast_to_mxfp
+from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, SwizzlingType
 from triton_kernels.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
 from triton_kernels.numerics import InFlexData
 from triton_kernels.routing import routing
@@ -61,10 +63,14 @@ def quantize(w, dtype, dev, **opt):
     else:
         assert dtype == "mx4", f"{dtype=}"
         swizzle_mx_scale = opt["swizzle_mx_scale"]
+        swizzle_mx_value = opt["swizzle_mx_value"]
         swizzle_axis = 2 if swizzle_mx_scale else None
         w = w.to(torch.bfloat16)
-        w, mx_scales, weight_scale_shape = downcast_to_mxfp(w, torch.uint8, axis=1, swizzle_axis=swizzle_axis)
-        return w, InFlexData(), MicroscalingCtx(weight_scale=mx_scales, swizzle_mx=swizzle_mx_scale,
+        w, mx_scales, weight_scale_shape = downcast_to_mxfp(w, torch.uint8, axis=1, swizzle_axis=swizzle_axis,
+                                                            swizzle_scale=swizzle_mx_scale,
+                                                            swizzle_value=swizzle_mx_value)
+        return w, InFlexData(), MicroscalingCtx(weight_scale=mx_scales, swizzle_scale=swizzle_mx_scale,
+                                                swizzle_value=swizzle_mx_value,
                                                 actual_weight_scale_shape=weight_scale_shape)
 
 
@@ -73,6 +79,7 @@ class PerfData:
     time: float
     flops: float
     bytes: float
+    bitwidth: int
 
     @property
     def tflops(self):
@@ -92,8 +99,9 @@ def opint(self):
     def util(self) -> float:
         if SPECS is None:
             return 0.0
+        assert self.bitwidth in (8, 16)
 
-        peak_flops = max(SPECS["MAX_TFLOPS8"], SPECS.get("MAX_TFLOPS16", 0))
+        peak_flops = SPECS["MAX_TFLOPS8"] if self.bitwidth == 8 else SPECS["MAX_TFLOPS16"]
         min_t_flop = self.flops / peak_flops * 1e-3  # ns → µs
         min_t_bw = self.bytes / SPECS["MAX_TBPS"] * 1e-3
         return max(min_t_flop, min_t_bw) / self.time
@@ -116,8 +124,21 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
 
     # -- numerics --
     optg = dict()
-    opt1 = {"swizzle_mx_scale": True} if w_dtype == "mx4" else dict()
-    opt2 = {"swizzle_mx_scale": True} if w_dtype == "mx4" else dict()
+    opt1 = dict()
+    opt2 = dict()
+    if w_dtype == "mx4" and not is_hip():
+        if torch.cuda.get_device_capability()[0] < 9:
+            # NYI for Ampere
+            swizzle_mx_value = None
+            swizzle_mx_scale = None
+        elif torch.cuda.get_device_capability()[0] < 10:
+            swizzle_mx_value = SwizzlingType.HOPPER
+            swizzle_mx_scale = SwizzlingType.HOPPER
+        else:
+            swizzle_mx_value = None
+            swizzle_mx_scale = SwizzlingType.BLACKWELL
+        opt1 = {"swizzle_mx_value": swizzle_mx_value, "swizzle_mx_scale": swizzle_mx_scale}
+        opt2 = deepcopy(opt1)
     wg, wg_flex, wg_mx = quantize(wg, "bf16", dev, **optg)
     w1, w1_flex, w1_mx = quantize(w1, w_dtype, dev, **opt1)
     w2, w2_flex, w2_mx = quantize(w2, w_dtype, dev, **opt2)
@@ -165,7 +186,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
     # TODO: proton should really be recording that in the json instead of
     # relying on the user to aggregate
     time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-    return PerfData(time, flops, bytes)
+    return PerfData(time, flops, bytes, x_dtype.itemsize * 8)
 
 
 def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP=1, EP=1, name="",
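Put together, the bench_mlp.py changes select a swizzling scheme per GPU generation and thread it through quantisation. A minimal standalone sketch (not the committed code verbatim), assuming the downcast_to_mxfp and MicroscalingCtx signatures shown in the diff above:

import torch
from triton_kernels.matmul_ogs import MicroscalingCtx
from triton_kernels.numerics import InFlexData
from triton_kernels.numerics_details.mxfp import SwizzlingType, downcast_to_mxfp

def pick_swizzling():
    # Mirrors the capability checks added to bench_mlp (CUDA only; HIP gets no swizzling).
    cap = torch.cuda.get_device_capability()[0]
    if cap < 9:   # Ampere: swizzling not implemented
        return None, None
    if cap < 10:  # Hopper: swizzle both values and scales
        return SwizzlingType.HOPPER, SwizzlingType.HOPPER
    return None, SwizzlingType.BLACKWELL  # Blackwell: swizzle scales only

def quantize_mx4(w, swizzle_mx_value, swizzle_mx_scale):
    # Same shape as the updated quantize() branch for dtype == "mx4".
    swizzle_axis = 2 if swizzle_mx_scale else None
    w = w.to(torch.bfloat16)
    w, mx_scales, weight_scale_shape = downcast_to_mxfp(
        w, torch.uint8, axis=1, swizzle_axis=swizzle_axis,
        swizzle_value=swizzle_mx_value, swizzle_scale=swizzle_mx_scale)
    mx_ctx = MicroscalingCtx(weight_scale=mx_scales, swizzle_value=swizzle_mx_value,
                             swizzle_scale=swizzle_mx_scale,
                             actual_weight_scale_shape=weight_scale_shape)
    return w, InFlexData(), mx_ctx

Note the asymmetry encoded in the diff: on Hopper both the fp4 values and the scales are swizzled, while on Blackwell only the scales are.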

python/triton_kernels/tests/test_matmul.py

Lines changed: 53 additions & 25 deletions
@@ -2,7 +2,6 @@
 import pytest
 import torch
 from typing import Union
-# benchmarking utilities
 # routing utilities
 from triton_kernels.routing import routing
 # matmul utilities
@@ -12,7 +11,7 @@
 from triton_kernels.matmul_ogs import matmul_ogs, matmul_ogs_torch
 # numerics utilities
 from triton_kernels.numerics import InFlexData, OutFlexData
-from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
+from triton_kernels.numerics_details.mxfp import SwizzlingType, downcast_to_mxfp, upcast_from_mxfp
 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
@@ -139,7 +138,7 @@ class Case:
     n_expts_act: int = 1
     n_expt_shards: int = 1
     split_k: int = 1
-    swizzle_mx_scale: bool = False
+    hbm_swizzling: bool = False
     epilogue_subtile: Union[bool, None] = None
 
 
@@ -174,25 +173,28 @@ class Case:
     Case(1000, 700, 700, "ragged", "float16", "float16", 8, 2, split_k=9),
     # mx types:
     Case(16, 256, 256, "ragged", "bfloat16", "mxfloat4_e2m1", 128, 4),
+    Case(16, 256, 256, "ragged", "bfloat16", "mxfloat4_e2m1", 128, 4, hbm_swizzling=True),
     Case(1000, 700, 700, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2),
+    Case(1000, 700, 700, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2, hbm_swizzling=True),
     Case(1000, 700, 700, "ragged", "bfloat16", "mxfloat4_e2m1", 8, 2, split_k=9),
+    Case(1000, 512, 256, "ragged", "bfloat16", "mxfloat4_e2m1", 8, 2, split_k=9, hbm_swizzling=True),
     Case(300, 400, 400, "ragged", "bfloat16", "mxfloat8_e4m3fn", 8, 4),
-    Case(300, 400, 400, "ragged", "bfloat16", "mxfloat8_e4m3fn", 8, 4, swizzle_mx_scale=True),
+    Case(300, 400, 400, "ragged", "bfloat16", "mxfloat8_e4m3fn", 8, 4, hbm_swizzling=True),
     Case(300, 400, 400, "batched", "bfloat16", "mxfloat8_e5m2", 32, 4),
     Case(1000, 700, 2, "batched", "bfloat16", "mxfloat4_e2m1", 8, 2),
-    Case(16, 256, 256, "ragged", "float8_e5m2", "mxfloat4_e2m1", 128, 4, swizzle_mx_scale=True),
-    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, swizzle_mx_scale=True),
-    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, swizzle_mx_scale=False),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9, swizzle_mx_scale=False),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9, swizzle_mx_scale=True),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, swizzle_mx_scale=False),
-    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, swizzle_mx_scale=True),
-    Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4, swizzle_mx_scale=False),
-    Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4, swizzle_mx_scale=True),
-    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4, swizzle_mx_scale=False),
-    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4, swizzle_mx_scale=True),
-    Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4, swizzle_mx_scale=False),
-    Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4, swizzle_mx_scale=True),
+    Case(16, 256, 256, "ragged", "float8_e5m2", "mxfloat4_e2m1", 128, 4, hbm_swizzling=True),
+    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1, hbm_swizzling=True),
+    Case(1000, 704, 800, "batched", "float8_e5m2", "mxfloat4_e2m1", 3, 1),
+    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9),
+    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, split_k=9, hbm_swizzling=True),
+    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2),
+    Case(1000, 704, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 2, hbm_swizzling=True),
+    Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4),
+    Case(300, 400, 400, "ragged", "float8_e5m2", "mxfloat8_e4m3fn", 8, 4, hbm_swizzling=True),
+    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4),
+    Case(300, 400, 800, "ragged", "float8_e5m2", "mxfloat4_e2m1", 8, 4, hbm_swizzling=True),
+    Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4),
+    Case(300, 400, 400, "batched", "float8_e5m2", "mxfloat8_e4m3fn", 32, 4, hbm_swizzling=True),
     # AMD
     Case(300, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz"),
     Case(1000, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 3, 1),
@@ -214,8 +216,8 @@ class Case:
 @pytest.mark.parametrize("has_y_gammas", [False, True])
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas, is_persistent, n_expts_tot,
-            n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, swizzle_mx_scale,
-            epilogue_subtile, device):
+            n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
+            device):
     # TODO: remove when Triton FP8 supports proper RTNE
     if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
         pytest.skip("Float8 not tested on A100")
@@ -229,11 +231,22 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
         pytest.skip("float8 x mx not supported with cuda capability < 10")
     if fused_scatter and split_k > 1:
         pytest.skip("fused scatter scratchpad not supported with split_k")
+    if hbm_swizzling:
+        if is_hip():
+            pytest.skip("NYI. HBM swizzling just implemented for CUDA.")
+        if torch.cuda.get_device_capability()[0] < 9:
+            pytest.skip("NYI. Ampere swizzling.")
+        if torch.cuda.get_device_capability()[0] < 10:
+            if "mxfloat4" not in weight_dtype_str:
+                pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
+            if k % 64 != 0 or n % 64 != 0:
+                # Automatic padding not implemented for Hopper swizzle
+                pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")
 
     torch.manual_seed(0)
 
     block_k = None
-    if is_persistent and weight_dtype_str.startswith("mx") and not torch.cuda.get_device_capability()[0] >= 10:
+    if is_persistent and weight_dtype_str.startswith("mx") and torch.cuda.get_device_capability()[0] < 10:
         # Override block_k for testing correctness. The default is temporarily 128 for
         # performance reasons which doesn't work with persistent matmul.
         # TODO: revisit when Triton is better for H100 + MXFP4
@@ -273,12 +286,27 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     x_ref, w_ref, bias_ref, gs0_ref, gs1_ref = apply_precision(x_tri, w_tri, bias_tri, gs0_tri, gs1_tri, precision_opt)
 
     if is_mixed_input:
-        swizzle_axis = 2 if swizzle_mx_scale else None
+        if hbm_swizzling:
+            swizzle_axis = 2
+            if torch.cuda.get_device_capability()[0] < 10:
+                swizzle_value = SwizzlingType.HOPPER
+                swizzle_scale = SwizzlingType.HOPPER
+            else:
+                swizzle_value = None
+                swizzle_scale = SwizzlingType.BLACKWELL
+        else:
+            swizzle_axis = None
+            swizzle_value = None
+            swizzle_scale = None
         w_tri, mx_scales_tri, weight_scale_shape = downcast_to_mxfp(w_tri, weight_dtype, axis=1,
-                                                                    swizzle_axis=swizzle_axis)
-        w_ref = upcast_from_mxfp(w_tri, mx_scales_tri, torch.bfloat16, axis=1, swizzle_axis=swizzle_axis)
-
-        precision_opt.mx_ctx = MicroscalingCtx(weight_scale=mx_scales_tri, swizzle_mx=swizzle_mx_scale,
+                                                                    swizzle_axis=swizzle_axis,
+                                                                    swizzle_value=swizzle_value,
+                                                                    swizzle_scale=swizzle_scale)
+        w_ref = upcast_from_mxfp(w_tri, mx_scales_tri, torch.bfloat16, axis=1, swizzle_axis=swizzle_axis,
+                                 swizzle_value=swizzle_value, swizzle_scale=swizzle_scale)
+
+        precision_opt.mx_ctx = MicroscalingCtx(weight_scale=mx_scales_tri, swizzle_value=swizzle_value,
+                                               swizzle_scale=swizzle_scale,
                                                actual_weight_scale_shape=weight_scale_shape)
 
     if is_persistent and not can_use_persistent_tma(x_tri, w_tri, gindx, precision_opt):
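The updated test validates swizzling through a downcast/upcast round trip. The following sketch illustrates the same pattern outside the test harness; it assumes a Hopper or newer GPU, the signatures shown in the diff above, a hypothetical weight tensor whose last two dimensions are multiples of the 64x64 Hopper tile, and uint8 storage for the packed fp4 values as in bench_mlp.py:

import torch
from triton_kernels.numerics_details.mxfp import SwizzlingType, downcast_to_mxfp, upcast_from_mxfp

# Hypothetical (n_experts, K, N) weight; K and N multiples of 64 so Hopper swizzling applies.
w = torch.randn(8, 256, 256, dtype=torch.bfloat16, device="cuda")
cap = torch.cuda.get_device_capability()[0]
if cap < 10:  # Hopper: swizzle values and scales
    swizzle_value, swizzle_scale = SwizzlingType.HOPPER, SwizzlingType.HOPPER
else:         # Blackwell: swizzle scales only
    swizzle_value, swizzle_scale = None, SwizzlingType.BLACKWELL

w_q, scales, scale_shape = downcast_to_mxfp(w, torch.uint8, axis=1, swizzle_axis=2,
                                            swizzle_value=swizzle_value, swizzle_scale=swizzle_scale)
# Upcasting with the same swizzle arguments undoes the layout transform; mxfp4 is
# lossy, so any value comparison is approximate (the test uses its own tolerances).
w_back = upcast_from_mxfp(w_q, scales, torch.bfloat16, axis=1, swizzle_axis=2,
                          swizzle_value=swizzle_value, swizzle_scale=swizzle_scale)
assert w_back.shape == w.shape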
