
Commit 2726dac

1009 round down largest power of 2 (#8413)
The OCP standard recommends: "Set X to be the largest power-of-two less than or equal to max_{Vi ∈ V}(|Vi|), divided by the largest power-of-two representable in the element data type." This PR changes the behavior of DequantScaleRoundingMode.ROUND_DOWN to follow this. Since ROUND_UP was the default and the old ROUND_DOWN would have caused a lot of clipping, we assume it was not in use. This PR should make the ROUND_DOWN mode more useful.

# New contributor declaration
- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 5d2a7a9 commit 2726dac
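For intuition, the quoted OCP rule can be written out directly in plain Python. This is an illustrative sketch, not the kernel code: the helper name is made up, and the 4.0 argument assumes e2m1's largest representable power of two (mirroring `_get_max_power_of_2_quant_val` in this PR).

```python
import math

def ocp_round_down_scale(block, elem_max_pow2):
    # X = (largest power of two <= max |Vi|) divided by the largest
    # power of two representable in the element data type.
    max_abs = max(abs(v) for v in block)
    return 2.0 ** math.floor(math.log2(max_abs)) / elem_max_pow2

# A block whose max magnitude is 33: the largest power of two <= 33 is 32,
# and e2m1's largest power of two is 4, so the scale is 32 / 4 = 8 = 2**3
# (stored as biased exponent 127 + 3).
assert ocp_round_down_scale([33.0, 24.0, 16.0, 8.0, 4.0, 0.0, -32.0, 0.0], 4.0) == 8.0
```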

3 files changed: +47 -2 lines changed


python/triton_kernels/tests/test_mxfp.py

Lines changed: 28 additions & 0 deletions
@@ -44,6 +44,34 @@ def test_mxfp4_rounding_cases(dst_dtype, device):
     dequant_torch = upcast_from_mxfp_torch(quant_torch, scale_torch, dst_dtype, axis=1)
     assert_equal(dequant_torch, dequant)
 
+    # ROUND_DOWN should use the max power-of-two when computing the scale.
+    # Choose a block whose max is 33 so the chosen scale is
+    # 2**floor(log2(33/(e2m1 max power of 2 = 4))) = 2**3 = 8 (exponent 127+3),
+    # and the other values are multiples of representable FP4 values times 8
+    # that allow exact reconstruction.
+    x = torch.tensor([33.0, 24.0, 16.0, 8.0, 4.0, 0.0, -32.0, 0.0], device=device).bfloat16().view(1, -1, 1)
+    quant, scale = downcast_to_mxfp(
+        x,
+        torch.uint8,
+        axis=1,
+        DEQUANT_SCALE_ROUNDING_MODE=DequantScaleRoundingMode.ROUND_DOWN,
+    )
+    dequant = upcast_from_mxfp(quant, scale, dst_dtype, axis=1)
+    assert_equal(dequant[0, 1:, :], x[0, 1:, :])
+
+    # Golden: scale exponent is 127 + 3 for 2**3 = 8
+    assert scale.item() == 127 + 3
+
+    # Torch reference path should match
+    quant_torch, scale_torch = downcast_to_mxfp_torch(
+        x,
+        torch.uint8,
+        axis=1,
+        DEQUANT_SCALE_ROUNDING_MODE=DequantScaleRoundingMode.ROUND_DOWN,
+    )
+    assert_equal(quant_torch, quant)
+    assert_equal(scale_torch, scale)
+
 
 @pytest.mark.parametrize("src_dtype", ["float4_e2m1", "float8_e5m2", "float8_e4m3fn"])
 @pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 8 additions & 1 deletion
@@ -1,6 +1,7 @@
 # isort: off
 # fmt: off
 from enum import Enum
+import math
 import triton
 import torch
 import torch.nn.functional as F
@@ -13,7 +14,10 @@
 
 
 class DequantScaleRoundingMode(Enum):
+    # 2^round_up(log2(max/max_q)) avoids clipping the max value
     ROUND_UP = 0
+    # 2^round_down(log2(max/max_power_of_2_q)) follows the OCP standard;
+    # ~50% chance of clipping the max value.
     ROUND_DOWN = 1
 
 
@@ -176,7 +180,10 @@ def downcast_to_mxfp_torch(src_tensor: torch.Tensor, out_quant_type: torch.dtype
 
     # Choose a max quantization value depending on type.
     max_quant_val = get_max_quant_val(out_quant_type)
-    dequant_scale = max_val / max_quant_val  # shape: (..., padded_axis_shape//32, 1)
+    if DEQUANT_SCALE_ROUNDING_MODE == DequantScaleRoundingMode.ROUND_UP:
+        dequant_scale = max_val / max_quant_val  # shape: (..., padded_axis_shape//32, 1)
+    else:
+        dequant_scale = max_val / (2 ** math.floor(math.log2(max_quant_val)))
 
     # Convert to int to round the FP32 scale, prior to quantization!
     ds_int = dequant_scale.view(torch.int32)
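To see why dividing by the largest power of two rather than the full max makes ROUND_DOWN more useful, compare the old and new behavior on the test block's max of 33. This sketch assumes e2m1's max representable value is 6.0 (what `get_max_quant_val` would return for this type):

```python
import math

max_val = 33.0
# Old ROUND_DOWN: floor the exponent of max_val / 6.0 = 5.5 -> scale 4,
# so 33 / 4 = 8.25 clips hard against e2m1's max of 6 (dequantizes to 24).
old_scale = 2.0 ** math.floor(math.log2(max_val / 6.0))
# New ROUND_DOWN (this PR): floor the exponent of max_val / 4.0 = 8.25 -> scale 8,
# so 33 / 8 = 4.125 merely rounds to 4 (dequantizes to 32).
new_scale = 2.0 ** math.floor(math.log2(max_val / 4.0))
assert (old_scale, new_scale) == (4.0, 8.0)
```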

python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py

Lines changed: 11 additions & 1 deletion
@@ -18,6 +18,15 @@ def _get_max_quant_val(dtype: tl.constexpr):
     else:
         tl.static_assert(False, f"Invalid {dtype=}")
 
+@triton.jit
+def _get_max_power_of_2_quant_val(dtype: tl.constexpr):
+    if dtype == tl.uint8:
+        return 4.0
+    elif dtype == tl.float8e5:
+        return 32768.0
+    elif dtype == tl.float8e4nv:
+        return 256.0
+
 @triton.jit
 def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.constexpr,
                              DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr = 0):
@@ -32,18 +41,19 @@ def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.constexpr,
     abs_tensor = tl.where(valid_src_mask, abs_tensor, -1.0)  # Don't consider padding tensors in scale computation
     abs_tensor = tl.reshape(abs_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
     max_val = tl.max(abs_tensor, axis=2, keep_dims=True)
-    dequant_scale = max_val / _get_max_quant_val(mx_tensor_dtype)
     if DEQUANT_SCALE_ROUNDING_MODE == 0:
         # DequantScaleRoundingMode.ROUND_UP
         # compute 2 ** ceil(log2(dequant_scale))
         # Adding 0x007FFFFF increments the exponent by 1 unless the mantissa is all zeros.
         # A corner case: an exponent of 0xFF would overflow, but that value is already NaN, so assume we don't care.
+        dequant_scale = max_val / _get_max_quant_val(mx_tensor_dtype)
         dequant_scale_exponent = (dequant_scale.to(tl.uint32, bitcast=True) + 0x007FFFFF) & 0x7F800000
     else:
         # DequantScaleRoundingMode.ROUND_DOWN
         # compute 2 ** floor(log2(dequant_scale))
         assert DEQUANT_SCALE_ROUNDING_MODE == 1
+        dequant_scale = max_val / _get_max_power_of_2_quant_val(mx_tensor_dtype)
         dequant_scale_exponent = dequant_scale.to(tl.uint32, bitcast=True) & 0x7F800000
     dequant_scale_rounded = dequant_scale_exponent.to(tl.float32, bitcast=True)
     quant_scale = tl.where(dequant_scale_rounded == 0, 0, 1.0 / dequant_scale_rounded)
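The exponent bit tricks used in the kernel can be sanity-checked with plain IEEE-754 bit manipulation; a standalone sketch, not Triton code:

```python
import struct

def f32_bits(x: float) -> int:
    return struct.unpack("<I", struct.pack("<f", x))[0]

def bits_f32(b: int) -> float:
    return struct.unpack("<f", struct.pack("<I", b))[0]

x = 8.25
# ROUND_DOWN: masking off the mantissa keeps 2**floor(log2(x)).
assert bits_f32(f32_bits(x) & 0x7F800000) == 8.0
# ROUND_UP: adding 0x007FFFFF carries into the exponent unless the
# mantissa is already all zeros, yielding 2**ceil(log2(x)).
assert bits_f32((f32_bits(x) + 0x007FFFFF) & 0x7F800000) == 16.0
# Exact powers of two are unchanged by either rounding mode.
assert bits_f32((f32_bits(8.0) + 0x007FFFFF) & 0x7F800000) == 8.0
```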
