
Commit 3d33f74

[mxfp] adjust num_stages for bf16/fp16 x mxfp (#8773)
For fp16/bf16 x mxfp, we upcast the weight on the fly, so we should size `smem_capacity` accordingly. Without this change, we get the following error:

"triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263356, Hardware limit: 232448. Reducing block sizes or `num_stages` may help"

for x.shape = [2048, 5120] bf16 x [32, 5120, 5120] float8_e4m3fn with block_m=64, block_n=256, block_k=128, split_k=1, is_persistent=True, which leads to num_stages=4.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because I wasn't able to find a shape that runs reliably without OOMs. The example shape above (32 x 5120 x 5120) is too big. Will try to see if I can enable a test only on GB200.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
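To make the shared-memory accounting concrete, here is a rough back-of-the-envelope sketch of the per-stage sizing for the failing configuration quoted above. It only models the two tile terms that change in this commit; the real `compute_num_stages` in `opt_flags_nvidia.py` adds further terms (e.g. mx scales), so the numbers are illustrative rather than exact.

```python
# Back-of-the-envelope sizing for the failing shape quoted above
# (block_m=64, block_n=256, block_k=128, bf16 activations x float8_e4m3fn weights).
block_m, block_n, block_k = 64, 256, 128
lhs_itemsize = 2        # bf16 activations
rhs_storage_size = 1    # float8_e4m3fn weights as stored
smem_capacity = 232448  # hardware limit reported in the error message

# Old accounting: the weight tile is counted at its storage width.
stage_old = block_m * block_k * lhs_itemsize + block_k * block_n * rhs_storage_size
print(stage_old, min(smem_capacity // stage_old, 4))  # 49152 bytes/stage -> 4 stages

# New accounting: for fp16/bf16 x mxfp the weight tile is upcast on the fly,
# so it occupies 2 bytes per element in shared memory.
stage_new = block_m * block_k * lhs_itemsize + block_k * block_n * 2
print(stage_new, min(smem_capacity // stage_new, 4))  # 81920 bytes/stage -> 2 stages

# With 4 stages, the upcast weight tiles alone need 4 * 65536 = 262144 bytes,
# already over the 232448-byte limit -- consistent with the ~263 KB "Required"
# figure in the OutOfResources error.
```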
1 parent 3ac3994 commit 3d33f74

File tree

1 file changed: +17 -3 lines

python/triton_kernels/triton_kernels/matmul_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 17 additions & 3 deletions
@@ -1,9 +1,10 @@
+import warnings
+
 import torch
 import triton
 from triton_kernels import target_info
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
-from triton_kernels.tensor import FP4, bitwidth, get_layout
-from triton_kernels.tensor import Tensor
+from triton_kernels.tensor import FP4, Tensor, bitwidth, get_layout
 from triton_kernels.tensor_details.layout import HopperMXScaleLayout
 from triton_kernels.tensor_details.layout_details.blackwell_scale import BlackwellActMXScaleLayout
 

@@ -98,6 +99,14 @@ def compute_num_stages(
     if precision_config.max_num_imprecise_acc is not None:
         return 3
     weight_size = bitwidth(rhs_dtype) / 8
+    if precision_config.b_mx_scale is not None and lhs_dtype in [torch.float16, torch.bfloat16]:
+        # For fp16/bf16 x mxfp, we upcast weight on the fly, so size
+        # smem_capacity accordingly.
+        # w/o this, gets the following error:
+        # "triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263356, Hardware limit: 232448. Reducing block sizes or `num_stages` may help"
+        # for x.shape = [2048, >=4096] bf16 x [32, >=4096, >=4096] float8_e4m3fn
+        # block_m=64, block_n=256, block_k=128, split_k=1, is_persistent=True -> leading to num_stages=4
+        weight_size = 2
     stage_size = block_m * block_k * lhs_dtype.itemsize + block_k * block_n * weight_size
     device_props = torch.cuda.get_device_properties(0)
     smem_capacity = device_props.shared_memory_per_block_optin

@@ -132,5 +141,10 @@ def compute_num_stages(
     elif has_native_mxfp:
         # mx scales
         stage_size += block_n * (block_k // int(MXFP_BLOCK_SIZE))
-    num_stages = min(4, smem_capacity // int(stage_size))
+    num_stages = min(smem_capacity // int(stage_size), 4)
+    if num_stages == 0:
+        warnings.warn(f"num_stages computed is 0 with {stage_size=} and {smem_capacity=}, "
+                      "bumping up to 1 but this may lead to out of shared memory errors, "
+                      "and in that case consider reducing block sizes.")
+        num_stages = 1
     return num_stages
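The last hunk also keeps `num_stages` from collapsing to 0 when even a single stage no longer fits in shared memory. Below is a minimal standalone sketch of that clamping behaviour; the `pick_num_stages` helper is hypothetical and made up here for illustration, while the real logic lives inside `compute_num_stages`.

```python
import warnings

def pick_num_stages(stage_size: int, smem_capacity: int, max_stages: int = 4) -> int:
    # Cap the pipeline depth at max_stages, but never return 0: a zero-stage
    # pipeline is invalid, so bump to 1 and warn that it may still OOM.
    num_stages = min(smem_capacity // int(stage_size), max_stages)
    if num_stages == 0:
        warnings.warn(f"num_stages computed is 0 with {stage_size=} and {smem_capacity=}, "
                      "bumping up to 1 but this may lead to out of shared memory errors, "
                      "and in that case consider reducing block sizes.")
        num_stages = 1
    return num_stages

print(pick_num_stages(stage_size=81920, smem_capacity=232448))   # 2
print(pick_num_stages(stage_size=300000, smem_capacity=232448))  # 1 (with a warning)
```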

0 commit comments
