Commit d4acc9d

[OMNIML-2336] make w4a8_nvfp4_fp8's scale factor in range of 448/6
Signed-off-by: Shiyang Chen <[email protected]>
1 parent: 1537885

1 file changed: +2 -0 lines changed

modelopt/torch/export/quant_utils.py

Lines changed: 2 additions & 0 deletions
@@ -298,6 +298,8 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
+    elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
+        return weight_quantizer._amax.float() / 448.0
 
     # SequentialQuantizer is required
     if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled:
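
Why this keeps the per-block scale factors within 448/6: in NVFP4's two-level scheme, each 16-element block stores an FP8 (E4M3) scale on top of a per-tensor scale, where 448 is the largest finite E4M3 value and 6 the largest FP4 (E2M1) magnitude. Below is a minimal sketch, assuming the per-block scale is computed as block_amax / 6 / per_tensor_scale; the helper name and block layout are illustrative, not part of the commit or the modelopt API.

import torch

def per_block_scales(weight: torch.Tensor, block_size: int = 16) -> torch.Tensor:
    # Hypothetical helper (not modelopt code): with the per-tensor scale set
    # to amax / 448, as this commit returns, every per-block FP8 scale lands
    # in [0, 448/6].
    global_amax = weight.abs().amax().float()
    scale_2 = global_amax / 448.0  # mirrors weight_quantizer._amax.float() / 448.0
    # Assumes weight.numel() is divisible by block_size (16 for NVFP4).
    block_amax = weight.abs().reshape(-1, block_size).amax(dim=-1).float()
    block_scales = block_amax / 6.0 / scale_2
    # Since block_amax <= global_amax, block_scales <= 448 / 6 (about 74.67),
    # well inside what FP8 E4M3 can represent without clipping.
    return block_scales

For comparison, a per-tensor scale of amax / (448 * 6), which is likely what the NVFP4 branch above obtains via NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer, would let the same block scales range up to 448, the very edge of E4M3; dividing by 448 alone caps them at 448/6 instead, matching the commit title.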
