Commit a21bbbc
[MXFP] mxfp conversions speedup (#8610)
This PR improves the throughput of the mxfp8 and mxfp4 upcast and downcast operations. I included a commit from @jongsoo-openai (original PR [here](triton-lang/triton#8179)) and added the improvements below on top of it. The PR is functionally a no-op, which is verified by the tests in ``python/triton_kernels/tests/test_mxfp.py``.

Upcast improvements:
- Added native packed e2m1 conversion to fp16 (for Blackwell+).
- Added tensor descriptors to utilize TMA for reading the input mxfp value tensor and writing the output.
  - Note that this required padding the innermost dimension of IO tensors that do not meet the tensor descriptor specification requirements, and unpadding the output afterwards.
- Tuned tile dimensions and num_warps.

Downcast improvements:
- Enabled vectorized stores of mxfp4 value tensors (h/t to @ThomasRaoux), as opposed to byte-level stores.
- Tuned the tile dimensions as well as num_warps.
- Unlike for upcast, tensor descriptors did not give a consistent performance improvement; I left that performance tuning as a TODO for a subsequent PR.

### Performance comparison (bandwidth in GB/s)

Measured via ``python/triton_kernels/tests/test_mxfp.py``.

**Before -- GB200**
```
MXFP8 (e4m3fn):
   M     N  quant_dtype            quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.float8_e4m3fn              1985.94             2053.35                2154.61               2347.56
4096  8192  torch.float8_e4m3fn              3479.79             3518.71                3243.02               3753.85

MXFP4 (e2m1):
   M     N  quant_dtype      quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.uint8                808.089             815.124                647.589                 713.9
4096  8192  torch.uint8               1045.23             1041.91                811.089                888.624
```

**After -- GB200**
```
MXFP8 (e4m3fn):
   M     N  quant_dtype            quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.float8_e4m3fn              2259.86             2404.99                2119.76               2361.66
4096  8192  torch.float8_e4m3fn              4106.69             4268.29                4038.16               4059

MXFP4 (e2m1):
   M     N  quant_dtype      quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.uint8               1334.75             1332.03                1424.7                1397.36
4096  8192  torch.uint8               2027.41             2028.98                2097.15               2275.56
```

**Before -- H100**
```
MXFP8 (e4m3fn):
   M     N  quant_dtype            quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.float8_e4m3fn              1250.29             1244.35                1595.2                1588.75
4096  8192  torch.float8_e4m3fn              1805.81             1799.62                2080.51               2118.34

MXFP4 (e2m1):
   M     N  quant_dtype      quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.uint8               418.493             416.102                572.367                627.739
4096  8192  torch.uint8               489.531             490.08                 687.861                758.08
```

**After -- H100**
```
MXFP8 (e4m3fn):
   M     N  quant_dtype            quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.float8_e4m3fn              1604.96             1624.86                1732.23               1751.52
4096  8192  torch.float8_e4m3fn              2347.56             2337.09                2386.74               2292.8

MXFP4 (e2m1):
   M     N  quant_dtype      quant_bw_bfloat16    quant_bw_float16    dequant_bw_bfloat16    dequant_bw_float16
----  ----  -------------  -------------------  ------------------  ---------------------  --------------------
1024  8192  torch.uint8               731.429             745.575                892.861                917.871
4096  8192  torch.uint8               882.343             894.995               1102.37                1165.08
```

Co-authored-by: jongsoo-openai <[email protected]>
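As background for the padding note above, here is a minimal host-side sketch of the pad/unpad flow (illustrative only, not the code in this PR; the helper name and the 16-element multiple are made up for the example, while the real alignment requirements come from the tensor descriptor specification and the MXFP block size):

```python
import torch
import torch.nn.functional as F

def pad_last_dim_to_multiple(x: torch.Tensor, multiple: int) -> tuple[torch.Tensor, int]:
    """Zero-pad the innermost dimension of `x` up to a multiple of `multiple`
    and return the padded tensor plus the original length, so the result can
    be sliced back after the kernel runs."""
    length = x.shape[-1]
    padded = (length + multiple - 1) // multiple * multiple
    if padded != length:
        x = F.pad(x, (0, padded - length))
    return x, length

x = torch.randn(4, 100, dtype=torch.bfloat16)
x_padded, orig_len = pad_last_dim_to_multiple(x, 16)  # hypothetical 16-element alignment
out_padded = x_padded.clone()                          # stand-in for the kernel's padded output
out = out_padded[..., :orig_len]                       # unpad afterwards
assert out.shape == x.shape
```

The extra work is a single pad of the input and a slice of the output, which is what makes the TMA path worthwhile despite tensors whose innermost dimension is not naturally aligned.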
1 parent c33b2d9 commit a21bbbc

File tree: 4 files changed, +218 −80 lines changed


python/triton_kernels/tests/test_mxfp.py

Lines changed: 39 additions & 14 deletions
```diff
@@ -1,3 +1,4 @@
+import itertools
 from functools import partial
 
 import pytest
@@ -23,19 +24,25 @@ def dtype_str_to_torch(dtype_str: str) -> torch.dtype:
 @pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])
 def test_mxfp4_rounding_cases(dst_dtype, device):
     dst_dtype = dtype_str_to_torch(dst_dtype)
-    x = torch.tensor([6, 0, 0.24, 0.25, 0.75, 0.99, 1.2, 1.3, 1.25, -1.25]).to(device).bfloat16().view(1, -1, 1)
+    two_point_five_plus_ulp = {
+        torch.bfloat16: 0.251953125,
+        torch.float16: 0.250244140625,
+        torch.float32: 0.2500000298023223877,
+    }[dst_dtype]
+    # Construct an example where scale is 1 (when max value is 6.0, the maximum value of e2m1)
+    x = torch.tensor([6, 0, 0.24, 0.25, 0.75, 0.99, 1.2, 1.3, -1.25, two_point_five_plus_ulp], dtype=dst_dtype,
+                     device=device).view(1, -1, 1)
     quant, scale = downcast_to_mxfp(x, torch.uint8, axis=1)
     dequant = upcast_from_mxfp(quant, scale, dst_dtype, axis=1)
     # Tie-breaking cases (RTNE):
     # - 0.25 is exactly halfway between 0.0 and 0.5. RTNE selects the even quantized value 0.0
     #   (binary LSB of target is 0). Rounding away from zero would pick 0.5; towards zero also picks 0.0.
     # - 0.75 is halfway between 0.5 and 1.0. RTNE selects the even value 1.0 (LSB 0). Away-from-zero would pick 1.0;
     #   towards-zero would pick 0.5.
-    # - 1.25 is halfway between 1.0 and 1.5. RTNE selects the even value 1.0. Away-from-zero would pick 1.5;
-    #   towards-zero would pick 1.0.
     # - -1.25 is halfway between -1.0 and -1.5. RTNE selects -1.0 (even). Away-from-zero would pick -1.5;
     #   towards-zero would pick -1.0.
-    assert dequant.flatten().tolist() == [6, 0, 0, 0.0, 1.0, 1.0, 1.0, 1.5, 1.0, -1.0], f"{dequant=}"
+    # - two_point_five_plus_ulp is slightly bigger than 0.25, so it rounds to 0.5.
+    assert dequant.flatten().tolist() == [6, 0, 0, 0.0, 1.0, 1.0, 1.0, 1.5, -1.0, 0.5], f"{dequant=}"
 
     quant_torch, scale_torch = downcast_to_mxfp_torch(x, torch.uint8, axis=1)
     assert_equal(quant_torch, quant)
@@ -153,6 +160,7 @@ def test_mxfp_casting(
 ):
     if "float8" in quant_dtype and (is_cuda() and torch.cuda.get_device_capability()[0] < 9):
         pytest.skip("Float8 not tested on A100")
+    torch.manual_seed(0)
     quant_torch_type = dtype_str_to_torch(quant_dtype)
     dequant_torch_type = dtype_str_to_torch(dequant_dtype)
     # Generate random input tensor that is contiguous once axis is the last dimension
@@ -220,15 +228,32 @@ def _benchmark_mxfp_dequantization(shape, src_quant_dtype: torch.dtype, target_d
     ]
 
     table = []
-    for shape, dtype in tests:
-        mxfp8_q_bw = _benchmark_mxfp_quantization(shape, dtype, torch.float8_e4m3fn)
-        mxfp8_dq_bw = _benchmark_mxfp_dequantization(shape, torch.float8_e4m3fn, dtype)
-        mxfp4_q_bw = _benchmark_mxfp_quantization(shape, dtype, torch.uint8)
-        mxfp4_dq_bw = _benchmark_mxfp_dequantization(shape, torch.uint8, dtype)
-        table.append(shape + (dtype, mxfp8_q_bw, mxfp8_dq_bw, mxfp4_q_bw, mxfp4_dq_bw))
+    shapes = [(1024, 8192), (4096, 8192)]
+    source_dtypes = [torch.bfloat16, torch.float16]
+    for shape, quant_dtype in itertools.product(shapes, [torch.float8_e4m3fn, torch.uint8]):
+        results = [*shape, quant_dtype]
+        for src_dtype in source_dtypes:
+            results.append(_benchmark_mxfp_quantization(shape, src_dtype, quant_dtype))
+        for src_dtype in source_dtypes:
+            results.append(_benchmark_mxfp_dequantization(shape, quant_dtype, src_dtype))
+        table.append(results)
 
     from tabulate import tabulate
-    print(
-        tabulate(
-            table,
-            headers=["M", "N", "dtype", "mxfp8_quant_bw", "mxfp8_dequant_bw", "mxfp4_quant_bw", "mxfp4_dequant_bw"]))
+
+    headers = [
+        "M",
+        "N",
+        "quant_dtype",
+        "quant_bw_bfloat16",
+        "quant_bw_float16",
+        "dequant_bw_bfloat16",
+        "dequant_bw_float16",
+    ]
+    mxfp8_rows = [row for row in table if row[2] == torch.float8_e4m3fn]
+    mxfp4_rows = [row for row in table if row[2] == torch.uint8]
+
+    print("MXFP8 (e4m3fn):")
+    print(tabulate(mxfp8_rows, headers=headers))
+    print()
+    print("MXFP4 (e2m1):")
+    print(tabulate(mxfp4_rows, headers=headers))
```
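For reference, the `two_point_five_plus_ulp` constants introduced in `test_mxfp4_rounding_cases` are 0.25 plus one unit in the last place of the destination dtype. A quick sketch (not part of the PR) reproduces them from each dtype's machine epsilon:

```python
import torch

# 0.25 is a power of two, so its upward ULP is 0.25 * eps, where eps is the spacing at 1.0.
for dtype in (torch.bfloat16, torch.float16, torch.float32):
    ulp_at_quarter = 0.25 * torch.finfo(dtype).eps
    print(dtype, 0.25 + ulp_at_quarter)
# bfloat16 -> 0.251953125, float16 -> 0.250244140625, float32 -> ~0.25000003
```

These are the smallest representable values above 0.25 in each dtype, which is why they must round up to 0.5 under RTNE while 0.25 itself is a tie that rounds to 0.0.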

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 79 additions & 26 deletions
```diff
@@ -7,6 +7,7 @@
 import torch.nn.functional as F
 from .mxfp_details._upcast_from_mxfp import _upcast_from_mxfp
 from .mxfp_details._downcast_to_mxfp import _downcast_to_mxfp, MXFP_BLOCK_SIZE, _quantize_mxfp8_fn
+from triton.tools.tensor_descriptor import TensorDescriptor
 
 # -----------------------------------------------------------------------------
 # Dequantization / Quantization Utilities
@@ -20,7 +21,6 @@ class DequantScaleRoundingMode(Enum):
     # chance of clipping the max value.
     ROUND_DOWN = 1
 
-
 def downcast_to_mxfp(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis: int,
                      DEQUANT_SCALE_ROUNDING_MODE: DequantScaleRoundingMode = DequantScaleRoundingMode.ROUND_UP):
     """
@@ -44,26 +44,40 @@ def downcast_to_mxfp(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis
     L = src_tensor.shape[-1]
     if is_fp4:
         assert L % 2 == 0, f"axis dim must be divisible by 2 for e2m1. Got {L}"
-    out_shape = src_tensor.shape[:-1] + (L // divisor, )
+    # Ensure last dimension is a multiple of MXFP_BLOCK_SIZE. This is expected by the kernel.
+    padded_L = triton.cdiv(L, MXFP_BLOCK_SIZE.value) * MXFP_BLOCK_SIZE.value
+    needs_padding = padded_L != L
+    out_shape_padded = src_tensor.shape[:-1] + (padded_L // divisor, )
     out_scale_shape = src_tensor.shape[:-1] + (triton.cdiv(L, MXFP_BLOCK_SIZE), )
 
-    out_quant_tensor = src_tensor.new_empty(out_shape, dtype=out_quant_type)
+    out_quant_tensor = src_tensor.new_empty(out_shape_padded, dtype=out_quant_type)
     out_scale = src_tensor.new_empty(out_scale_shape, dtype=torch.uint8)
 
     if src_tensor.numel() > 0:
-        kernel_src_tensor = src_tensor.reshape(-1, src_tensor.shape[-1])
+        src_tensor_padded = F.pad(src_tensor, (0, padded_L - L)) if needs_padding else src_tensor
+        kernel_src_tensor = src_tensor_padded.reshape(-1, src_tensor_padded.shape[-1])
         kernel_quant_tensor = out_quant_tensor.view(-1, out_quant_tensor.shape[-1])
         kernel_scale = out_scale.view(-1, out_scale.shape[-1])
 
-        BLOCK_OUT_DIM = 128
-        BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
-        grid_out = triton.cdiv(kernel_src_tensor.shape[0], BLOCK_OUT_DIM)
-        grid_quant = triton.cdiv(kernel_src_tensor.shape[1], BLOCK_QUANT_DIM)
-
-        _downcast_to_mxfp[(grid_out, grid_quant)](kernel_quant_tensor, *kernel_quant_tensor.stride(), kernel_scale,
-                                                  *kernel_scale.stride(), kernel_src_tensor, *kernel_src_tensor.stride(),
-                                                  *kernel_src_tensor.shape, BLOCK_OUT_DIM, BLOCK_QUANT_DIM,
-                                                  DEQUANT_SCALE_ROUNDING_MODE.value, num_warps=8)
+        # performance hyper-parameters
+        BLOCK_OUT_DIM = 32
+        BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value * 4
+        NUM_WARPS = 4 if src_tensor.dtype == torch.float32 else 8
+
+        blocks_out_dim = triton.cdiv(kernel_src_tensor.shape[0], BLOCK_OUT_DIM)
+        blocks_quant_dim = triton.cdiv(kernel_src_tensor.shape[1], BLOCK_QUANT_DIM)
+        _downcast_to_mxfp[(blocks_out_dim, blocks_quant_dim)](
+            kernel_quant_tensor, *kernel_quant_tensor.stride(),
+            kernel_scale, *kernel_scale.stride(),
+            kernel_src_tensor, *kernel_src_tensor.stride(), *kernel_src_tensor.shape,
+            BLOCK_OUT_DIM,
+            BLOCK_QUANT_DIM,
+            DEQUANT_SCALE_ROUNDING_MODE.value,
+            num_warps=NUM_WARPS,
+        )
+
+    if needs_padding:
+        out_quant_tensor = out_quant_tensor[..., : (L // divisor)]
 
     out_quant_tensor = out_quant_tensor.transpose(axis, src_tensor.ndim - 1)
     out_scale = out_scale.transpose(axis, src_tensor.ndim - 1)
@@ -89,23 +103,56 @@ def upcast_from_mxfp(tensor: torch.Tensor, scale: torch.Tensor, target_dtype: to
     assert scale.dtype == torch.uint8, f"Invalid scale dtype {scale.dtype=}"
     assert target_dtype in (torch.float16, torch.bfloat16, torch.float32), f"Invalid output dtype {target_dtype=}"
     # upcast
-    logical_quant_dim = tensor.shape[axis] * (2 if tensor.dtype == torch.uint8 else 1)
+    pack_multiple = 2 if tensor.dtype == torch.uint8 else 1
+    logical_quant_dim = tensor.shape[axis] * pack_multiple
     tensor = tensor.transpose(axis, tensor.ndim - 1).contiguous()
     scale = scale.transpose(axis, scale.ndim - 1).contiguous()
-    out = torch.empty((*tensor.shape[:-1], logical_quant_dim), dtype=target_dtype, device=tensor.device)
+    original_out_shape = tensor.shape[:-1] + (logical_quant_dim, )
 
     if tensor.numel() > 0:
-        reshaped_out = out.view(-1, out.shape[-1])
         reshaped_tensor = tensor.view(-1, tensor.shape[-1])
         reshaped_scale = scale.view(-1, scale.shape[-1])
-        BLOCK_OUT_DIM = 128
-        BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
+
+        # Pad the tensor and output if needed for tensor descriptor spec requirements.
+        TENSOR_DESC_PAD_REQ = 16
+        needs_padding = reshaped_tensor.shape[-1] % TENSOR_DESC_PAD_REQ != 0
+        if needs_padding:
+            tensor_pad_amount = TENSOR_DESC_PAD_REQ - (reshaped_tensor.shape[-1] % TENSOR_DESC_PAD_REQ)
+            reshaped_tensor = F.pad(reshaped_tensor, (0, tensor_pad_amount), "constant", 0)
+            pad_elems_count = tensor_pad_amount * pack_multiple
+            out_shape = original_out_shape[:-1] + (original_out_shape[-1] + pad_elems_count, )
+        else:
+            out_shape = original_out_shape
+        out = torch.empty(out_shape, dtype=target_dtype, device=tensor.device)
+        reshaped_out = out.view(-1, out.shape[-1])
+
+        is_fp4 = reshaped_tensor.dtype == torch.uint8
+
+        # performance hyper-parameters
+        BLOCK_OUT_DIM = 64
+        BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value * 4
+        NUM_WARPS = 4
+
         blocks_out_dim = triton.cdiv(reshaped_out.shape[0], BLOCK_OUT_DIM)
         blocks_quant_dim = triton.cdiv(reshaped_out.shape[1], BLOCK_QUANT_DIM)
-        _upcast_from_mxfp[(blocks_out_dim, blocks_quant_dim)](reshaped_out, *reshaped_out.stride(), reshaped_scale,
-                                                              *reshaped_scale.stride(), reshaped_tensor,
-                                                              *reshaped_tensor.stride(), *reshaped_out.shape, BLOCK_OUT_DIM,
-                                                              BLOCK_QUANT_DIM, num_warps=8)
+        k_divisor = 2 if is_fp4 else 1
+        block_size_quant_mx_tensor = BLOCK_QUANT_DIM // k_divisor
+        out_desc = TensorDescriptor.from_tensor(reshaped_out, [BLOCK_OUT_DIM, BLOCK_QUANT_DIM])
+        tensor_desc = TensorDescriptor.from_tensor(reshaped_tensor, [BLOCK_OUT_DIM, block_size_quant_mx_tensor])
+        _upcast_from_mxfp[(blocks_out_dim, blocks_quant_dim)](
+            out_desc,
+            tensor_desc,
+            reshaped_scale,
+            *reshaped_scale.stride(),
+            *reshaped_out.shape,
+            BLOCK_OUT_DIM,
+            BLOCK_QUANT_DIM,
+            num_warps=NUM_WARPS,
+        )
+        if needs_padding:
+            out = out[..., :original_out_shape[-1]]
+    else:
+        out = torch.empty(original_out_shape, dtype=target_dtype, device=tensor.device)
     out = out.transpose(axis, scale.ndim - 1).contiguous()
     return out
 
@@ -218,19 +265,25 @@ def downcast_to_mxfp_torch(src_tensor: torch.Tensor, out_quant_type: torch.dtype
     # Extract sign, exponent, and mantissa.
     signs = q_int & 0x80000000
     exponents = right_shift_unsigned(q_int, 23) & 0xFF
-    mantissas = q_int & 0x7FFFFF
+    mantissas_orig = q_int & 0x7FFFFF
 
     E8_BIAS = 127
     E2_BIAS = 1
     # Adjust mantissas for subnormals.
-    mantissas = torch.where(exponents < E8_BIAS, (0x400000 | right_shift_unsigned(mantissas, 1)) >>
-                            (E8_BIAS - exponents - 1), mantissas)
+    is_subnormal = exponents < E8_BIAS
+    shift = E8_BIAS - exponents - 1
+    mantissas_pre = (0x400000 | right_shift_unsigned(mantissas_orig, 1))
+    bit0_dropped = (mantissas_orig & 0x1) != 0
+    mask = (1 << shift.clamp(max=31)) - 1
+    dropped_post = (mantissas_pre & mask) != 0
+    sticky = is_subnormal & (bit0_dropped | dropped_post)
+    mantissas = torch.where(is_subnormal, mantissas_pre >> shift, mantissas_orig)
     exponents = torch.maximum(exponents, torch.tensor(E8_BIAS - E2_BIAS, device=device)) - (E8_BIAS - E2_BIAS)
     # Round to nearest, ties to even (RTNE)
     m2bits = right_shift_unsigned(mantissas, 21) & 0x3
     lsb_keep = right_shift_unsigned(m2bits, 1) & 0x1
     guard = m2bits & 0x1
-    sticky = (mantissas & ((1 << 21) - 1)) != 0
+    sticky |= (mantissas & ((1 << 21) - 1)) != 0
    round_inc = guard & (sticky.to(torch.int32) | lsb_keep)
     e2m1_tmp = right_shift_unsigned(((exponents << 2) | m2bits) + round_inc, 1)
     e2m1_tmp = torch.minimum(e2m1_tmp, torch.tensor(0x7, device=device))
```
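The sticky-bit change in `downcast_to_mxfp_torch` (and its Triton counterpart below) exists because bits shifted out while denormalizing the mantissa must still participate in the round-to-nearest-even decision. A standalone scalar sketch of that guard/sticky logic (illustrative only, not the library code):

```python
# Minimal RTNE with guard/sticky bits: drop `shift` low bits of an integer
# mantissa, rounding to nearest, ties to even.
def rtne_shift(mantissa: int, shift: int) -> int:
    if shift == 0:
        return mantissa
    kept = mantissa >> shift
    guard = (mantissa >> (shift - 1)) & 1                 # first dropped bit
    sticky = (mantissa & ((1 << (shift - 1)) - 1)) != 0   # any other dropped bit
    lsb_keep = kept & 1
    round_inc = guard & (int(sticky) | lsb_keep)
    return kept + round_inc

# Exact tie: guard=1, sticky=0, kept LSB even -> stays at 0b10 (ties to even).
assert rtne_shift(0b1010_00, 4) == 0b10
# One bit above the tie: sticky=1 forces rounding up to 0b11.
assert rtne_shift(0b1010_01, 4) == 0b11
```

Without OR-ing the dropped bits into `sticky`, the second case would be treated as an exact tie and round down; that is the kind of fp32 misrounding (e.g. 0.25 plus one ULP landing on 0.0 instead of 0.5) that the new handling guards against.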

python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py

Lines changed: 46 additions & 11 deletions
```diff
@@ -1,5 +1,6 @@
 import triton
 import triton.language as tl
+from triton_kernels.target_info import cuda_capability_geq
 
 # fmt: off
 
@@ -72,18 +73,42 @@ def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.con
     # Now we must convert the tensors to the mx format.
     if is_fp8:
         out_tensor = quant_tensor.to(mx_tensor_dtype)
+    elif cuda_capability_geq(10, 0):
+        # Convert scaled values to two f32 lanes and use PTX cvt to e2m1x2 with two f32 operands.
+        pairs = tl.reshape(quant_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM // 2, 2])
+        lo_f, hi_f = tl.split(pairs)
+        lo_f32 = lo_f.to(tl.float32)
+        hi_f32 = hi_f.to(tl.float32)
+
+        # Inline PTX: cvt.rn.satfinite.e2m1x2.f32 takes two f32 sources and produces one .b8 packed e2m1x2.
+        out_tensor = tl.inline_asm_elementwise(
+            """
+            {
+            .reg .b8 r;
+            cvt.rn.satfinite.e2m1x2.f32 r, $1, $2;
+            mov.b32 $0, {r, r, r, r};
+            }
+            """,
+            constraints="=r,f,f",
+            args=[hi_f32, lo_f32],
+            dtype=tl.uint8,
+            is_pure=True,
+            pack=1,
+        )
     else:
         quant_tensor = quant_tensor.to(tl.uint32, bitcast=True)
         signs = quant_tensor & 0x80000000
         exponents = (quant_tensor >> 23) & 0xFF
-        mantissas = (quant_tensor & 0x7FFFFF)
+        mantissas_orig = (quant_tensor & 0x7FFFFF)
 
         # For RTNE: 0.25 < x < 0.75 maps to 0.5 (denormal); exactly 0.25 maps to 0.0
         E8_BIAS = 127
         E2_BIAS = 1
         # Move implicit bit 1 at the beginning to mantissa for denormals
+        is_subnormal = exponents < E8_BIAS
         adjusted_exponents = tl.core.sub(E8_BIAS, exponents + 1, sanitize_overflow=False)
-        mantissas = tl.where(exponents < E8_BIAS, (0x400000 | (mantissas >> 1)) >> adjusted_exponents, mantissas)
+        mantissas_pre = (0x400000 | (mantissas_orig >> 1))
+        mantissas = tl.where(is_subnormal, mantissas_pre >> adjusted_exponents, mantissas_orig)
 
         # For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
         exponents = tl.maximum(exponents, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS)
@@ -93,7 +118,15 @@ def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.con
         m2bits = mantissas >> 21
         lsb_keep = (m2bits >> 1) & 0x1
         guard = m2bits & 0x1
-        sticky = ((mantissas & 0x1FFFFF) != 0).to(tl.uint32)
+        IS_SRC_FP32: tl.constexpr = src_tensor.dtype == tl.float32
+        if IS_SRC_FP32:
+            bit0_dropped = (mantissas_orig & 0x1) != 0
+            mask = (1 << tl.minimum(adjusted_exponents, 31)) - 1
+            dropped_post = (mantissas_pre & mask) != 0
+            sticky = is_subnormal & (bit0_dropped | dropped_post)
+            sticky |= ((mantissas & 0x1FFFFF) != 0).to(tl.uint32)
+        else:
+            sticky = ((mantissas & 0x1FFFFF) != 0).to(tl.uint32)
         round_inc = guard & (sticky | lsb_keep)
         e2m1_tmp = tl.minimum((((exponents << 2) | m2bits) + round_inc) >> 1, 0x7)
         e2m1_value = ((signs >> 28) | e2m1_tmp).to(tl.uint8)
@@ -105,12 +138,14 @@ def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.con
     return out_tensor, dequant_scale_exponent
 
 @triton.jit
-def _downcast_to_mxfp(mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.constexpr,
-                      mx_scale_ptr, stride_mx_scale_outer, stride_mx_scale_quant,
-                      src_ptr, stride_src_outer, stride_src_quant,
-                      outer_dim, quant_dim,
-                      BLOCK_SIZE_OUT_DIM: tl.constexpr, BLOCK_SIZE_QUANT_DIM: tl.constexpr,
-                      DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr):
+def _downcast_to_mxfp(
+    mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.constexpr,
+    mx_scale_ptr, stride_mx_scale_outer, stride_mx_scale_quant,
+    src_ptr, stride_src_outer, stride_src_quant, outer_dim, quant_dim,
+    BLOCK_SIZE_OUT_DIM: tl.constexpr,
+    BLOCK_SIZE_QUANT_DIM: tl.constexpr,
+    DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr,
+):
 
     tl.static_assert(stride_mxt_quant == 1, f"Output stride, {stride_mxt_quant=} must be 1.")
     tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, f"{BLOCK_SIZE_QUANT_DIM=} must be a multiple of 32")
@@ -150,10 +185,10 @@ def _downcast_to_mxfp(mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.cons
     mask_n = start_out + offs_outer < outer_dim
     full_mask_src = mask_src_quant & mask_n
 
-    mask_mxt_quant = start_mx_quant + offs_mxt_quant < tl.cdiv(quant_dim, K_DIVISOR)
+    mask_mxt_quant = start_mx_quant + offs_mxt_quant < quant_dim // K_DIVISOR  # requires quant_dim % K_DIVISOR == 0
     full_mask_mxt = mask_mxt_quant & mask_n
 
-    scale_mask_k = start_mx_scale_quant + offs_scale_quant < tl.cdiv(quant_dim, MXFP_BLOCK_SIZE)
+    scale_mask_k = start_mx_scale_quant + offs_scale_quant < quant_dim // MXFP_BLOCK_SIZE  # requires quant_dim % MXFP_BLOCK_SIZE == 0
    full_scale_mask = scale_mask_k & mask_n
 
     src_tensor_offsets = offs_src_quant * stride_src_quant + offs_outer * stride_src_outer
```
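As context for the packed `e2m1x2` bytes that both the PTX fast path and the bit-twiddling fallback produce: mxfp4 stores two e2m1 values per uint8. A small torch-side sketch of unpacking such bytes via the 16 e2m1 code points (illustrative only, not the library's API; it assumes the low nibble holds the first element of each pair, which may differ from the layout the kernels use):

```python
import torch

# The 16 e2m1 code points: codes 0-7 are +{0, 0.5, 1, 1.5, 2, 3, 4, 6},
# codes 8-15 are their negatives (sign bit set).
E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                            -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])

def unpack_e2m1x2(packed: torch.Tensor) -> torch.Tensor:
    """Unpack a uint8 tensor holding two e2m1 values per byte into float32,
    assuming the low nibble is the first element of each pair."""
    lo = packed & 0x0F
    hi = (packed >> 4) & 0x0F
    vals = torch.stack([E2M1_VALUES[lo.long()], E2M1_VALUES[hi.long()]], dim=-1)
    return vals.flatten(start_dim=-2)  # interleave low-nibble and high-nibble values

# Example: byte 0x21 packs codes 1 (low nibble) and 2 (high nibble) -> [0.5, 1.0].
print(unpack_e2m1x2(torch.tensor([0x21], dtype=torch.uint8)))
```

This is also why the vectorized mxfp4 stores mentioned in the description pay off: each byte already carries two values, so writing wider words amortizes the store traffic further.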
