
Commit fac1734

[mxfp] support quant/dequant from/to fp32 (triton-lang#7672)
It's slightly inconvenient to go through fp16/bf16 when we want to (de)quantize mxfp from/to fp32.

# New contributor declaration
- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests -> Added more test cases in `pytest -xs python/triton_kernels/tests/test_mxfp.py`
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 9eee56d commit fac1734
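
Illustrative usage (a minimal sketch, not part of the commit): the round trip below assumes the `downcast_to_mxfp`/`upcast_from_mxfp` helpers exercised by the tests take a destination dtype and an `axis`, and that `downcast_to_mxfp` returns the quantized tensor together with its E8M0 scales; treat the exact signatures as an assumption.

```python
# Hedged sketch of what this change enables: (de)quantizing MXFP directly
# from/to float32 instead of detouring through fp16/bf16.
# The signatures below are assumed from the tests, not authoritative.
import torch
from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp

x = torch.randn(16, 256, device="cuda", dtype=torch.float32)

# Quantize the float32 input to MXFP4 (two e2m1 values packed per uint8),
# with one E8M0 scale per 32-value block along the chosen axis.
xq, scale = downcast_to_mxfp(x, torch.uint8, axis=1)

# Dequantize straight back to float32 (previously limited to fp16/bf16).
x_dq = upcast_from_mxfp(xq, scale, torch.float32, axis=1)
assert x_dq.dtype == torch.float32
```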

4 files changed, +19 -17 lines changed


python/triton_kernels/tests/test_mxfp.py

Lines changed: 3 additions & 4 deletions
@@ -1,6 +1,5 @@
 import pytest
 import torch
-
 from triton_kernels.numerics_details.mxfp import (
     DequantScaleRoundingMode,
     downcast_to_mxfp,
@@ -16,7 +15,7 @@ def dtype_str_to_torch(dtype_str: str) -> torch.dtype:
     return torch.uint8 if dtype_str == "float4_e2m1" else getattr(torch, dtype_str)


-@pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16"])
+@pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])
 def test_mxfp4_rounding_cases(dst_dtype):
     dst_dtype = dtype_str_to_torch(dst_dtype)
     x = torch.tensor([6, 0, 0.24, 0.25, 0.75, 0.99, 1.2, 1.3]).cuda().bfloat16().view(1, -1, 1)
@@ -33,7 +32,7 @@ def test_mxfp4_rounding_cases(dst_dtype):


 @pytest.mark.parametrize("src_dtype", ["float4_e2m1", "float8_e5m2", "float8_e4m3fn"])
-@pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16"])
+@pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])
 def test_mxfp_quant_dequant(src_dtype, dst_dtype):
     if "float8" in src_dtype and torch.cuda.get_device_capability()[0] < 9:
         pytest.skip("Float8 not tested on A100")
@@ -79,7 +78,7 @@ def test_mxfp_quant_dequant(src_dtype, dst_dtype):
     ],
 )
 # fmt: on
-@pytest.mark.parametrize("dequant_dtype", ["float16", "bfloat16"])
+@pytest.mark.parametrize("dequant_dtype", ["float16", "bfloat16", "float32"])
 def test_mxfp_casting(
     shape: tuple[int, ...],
     axis: int,

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ def upcast_from_mxfp(tensor: torch.Tensor, scale: torch.Tensor, dtype: torch.dty
     assert tensor.dtype in {torch.uint8, torch.float8_e5m2, torch.float8_e4m3fn}, \
         f"Invalid tensor dtype {tensor.dtype=}"
     assert scale.dtype == torch.uint8, f"Invalid scale dtype {scale.dtype=}"
-    assert dtype in (torch.float16, torch.bfloat16), f"Invalid output dtype {dtype=}"
+    assert dtype in (torch.float16, torch.bfloat16, torch.float32), f"Invalid output dtype {dtype=}"
     # upcast
     logical_quant_dim = tensor.shape[axis] * (2 if tensor.dtype == torch.uint8 else 1)
     tensor = tensor.transpose(axis, tensor.ndim - 1).contiguous()

python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ def _downcast_to_mxfp(mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.cons

     src_dtype: tl.constexpr = src_ptr.dtype.element_ty
     tl.static_assert(mx_scale_ptr.dtype.element_ty == tl.uint8, f"{mx_scale_ptr.dtype.element_ty=} must be uint8")
-    tl.static_assert((src_dtype == tl.bfloat16) or (src_dtype == tl.float16), f"{src_dtype=} must be bfloat16 or float16")
+    tl.static_assert((src_dtype == tl.bfloat16) or (src_dtype == tl.float16) or (src_dtype == tl.float32), f"{src_dtype=} must be bfloat16 or float16 or float32")
     is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8

     outer_block = tl.program_id(0).to(tl.int64)
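
For orientation, here is a rough host-side illustration (a sketch only, not the kernel's exact algorithm or its `DequantScaleRoundingMode` handling) of why the downcast path is largely dtype-agnostic: the shared E8M0 scale is a pure power of two derived from the block maximum, so accepting a float32 source mainly amounts to relaxing the assert above.

```python
# Rough reference for the shared-scale idea behind MXFP4 downcasting.
# This is a simplification with a floor-style scale choice, not the kernel code.
import torch

def e8m0_block_scale(x: torch.Tensor) -> torch.Tensor:
    # x: (..., 32) float32 blocks; e2m1 magnitudes top out at 6.0 = 1.5 * 2**2.
    amax = x.abs().amax(dim=-1, keepdim=True).clamp_min(2**-126)
    exp = torch.floor(torch.log2(amax)) - 2   # shared exponent (one rounding choice)
    return torch.exp2(exp)                    # power-of-two scale, storable as E8M0

x = torch.randn(4, 32, dtype=torch.float32)
scaled = x / e8m0_block_scale(x)              # roughly in e2m1 range; values above
                                              # 6.0 would saturate on quantization
```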

python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py

Lines changed: 14 additions & 11 deletions
@@ -1,5 +1,6 @@
 import triton
 import triton.language as tl
+
 from ._downcast_to_mxfp import MXFP_BLOCK_SIZE


@@ -14,7 +15,7 @@ def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_
     # uint8 signifies two fp4 e2m1 values packed into a single byte
     mx_tensor_dtype: tl.constexpr = mx_tensor_ptr.dtype.element_ty
     dst_dtype: tl.constexpr = out_ptr.dtype.element_ty
-    tl.static_assert(dst_dtype == tl.float16 or dst_dtype == tl.bfloat16)
+    tl.static_assert(dst_dtype == tl.float16 or dst_dtype == tl.bfloat16 or dst_dtype == tl.float32)
     tl.static_assert(
         mx_tensor_dtype == tl.uint8
         or ((mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5) or mx_tensor_dtype == dst_dtype),
@@ -69,32 +70,33 @@ def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_
     if dst_dtype == tl.bfloat16:
         dst_scale = (scale.to(tl.uint16) << 7).to(dst_dtype, bitcast=True)
     else:
-        tl.static_assert(dst_dtype == tl.float16)
         dst_scale = (scale.to(tl.uint32) << 23).to(tl.float32, bitcast=True)
-        dst_scale = dst_scale.to(tl.float16)
+        if dst_dtype == tl.float16:
+            dst_scale = dst_scale.to(tl.float16)

     # Now upcast the tensor.
+    intermediate_dtype: tl.constexpr = tl.bfloat16 if dst_dtype == tl.float32 else dst_dtype
     if is_fp8:
-        dst_tensor = tensor.to(dst_dtype)
+        dst_tensor = tensor.to(intermediate_dtype)
         if tensor.dtype == tl.float8e5:
             from_e_bits: tl.constexpr = 5
             from_m_bits: tl.constexpr = 2
-            to_e_bits: tl.constexpr = 8 if dst_dtype == tl.bfloat16 else 5
-            to_m_bits: tl.constexpr = 7 if dst_dtype == tl.bfloat16 else 10
+            to_e_bits: tl.constexpr = 8 if intermediate_dtype == tl.bfloat16 else 5
+            to_m_bits: tl.constexpr = 7 if intermediate_dtype == tl.bfloat16 else 10

             # Preserve infs and nans. FIXME Fp8E5M2_to_Bf16 doesn't preserve them!
             non_finite_mask_src: tl.constexpr = ((1 << from_e_bits) - 1) << from_m_bits
             non_finite_mask_dst: tl.constexpr = ((1 << to_e_bits) - 1) << to_m_bits
             dst_tensor = tl.where(
                 (tensor.to(tl.uint8, bitcast=True) & non_finite_mask_src) == non_finite_mask_src,
-                (dst_tensor.to(tl.uint16, bitcast=True) | non_finite_mask_dst).to(dst_dtype, bitcast=True),
+                (dst_tensor.to(tl.uint16, bitcast=True) | non_finite_mask_dst).to(intermediate_dtype, bitcast=True),
                 dst_tensor,
             )
     else:
         assert is_fp4
-        dst_bias: tl.constexpr = 127 if dst_dtype == tl.bfloat16 else 15
-        dst_0p5: tl.constexpr = 16128 if dst_dtype == tl.bfloat16 else 0x3800
-        dst_m_bits: tl.constexpr = 7 if dst_dtype == tl.bfloat16 else 10
+        dst_bias: tl.constexpr = 127 if intermediate_dtype == tl.bfloat16 else 15
+        dst_0p5: tl.constexpr = 16128 if intermediate_dtype == tl.bfloat16 else 0x3800
+        dst_m_bits: tl.constexpr = 7 if intermediate_dtype == tl.bfloat16 else 10
         # e2m1
         em0 = tensor & 0x07
         em1 = tensor & 0x70
@@ -108,7 +110,8 @@ def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_
         x0 = tl.where(em0 == 0x01, dst_0p5 | (x0 & 0x8000), x0)
         x1 = tl.where(em1 == 0x10, dst_0p5 | (x1 & 0x8000), x1)
         # 3) x is zero, do nothing
-        dst_tensor = tl.interleave(x0, x1).to(dst_dtype, bitcast=True)
+        dst_tensor = tl.interleave(x0, x1).to(intermediate_dtype, bitcast=True)
+    dst_tensor = dst_tensor.to(dst_dtype)

     # Reshape for proper broadcasting: the scale was stored with a 32‐sized “inner” grouping.
     dst_tensor = dst_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
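
Two properties this reworked upcast leans on can be checked on the host with plain PyTorch; the snippet below is an assumed-equivalent illustration, not the kernel code: shifting the E8M0 scale byte into a float32 exponent field reproduces the power-of-two scale exactly, and every fp4/fp8 element value is exactly representable in the bfloat16 intermediate, so widening to float32 afterwards is lossless.

```python
# Hedged host-side check of the kernel's two tricks (assumed equivalent).
import torch

# 1) E8M0 scale decode: the byte is an exponent, so placing it in the float32
#    exponent field (<< 23) yields 2**(e - 127) exactly.
e = torch.tensor([126, 127, 128], dtype=torch.int32)
scale_f32 = (e << 23).view(torch.float32)     # tensor([0.5, 1.0, 2.0])

# 2) fp4 e2m1 (and fp8) element values fit exactly in bfloat16, so upcasting to
#    the bf16 intermediate and then casting to float32 loses no information.
e2m1_values = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
assert torch.equal(e2m1_values.to(torch.bfloat16).to(torch.float32), e2m1_values)
```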
