Commit 5d3b2e8

[OMNIML-2336] make w4a8_nvfp4_fp8's scale factor in range of 448/6
Signed-off-by: Shiyang Chen <[email protected]>
1 parent 1537885 commit 5d3b2e8

1 file changed: +2 -1 lines changed

modelopt/torch/export/quant_utils.py

Lines changed: 2 additions & 1 deletion
@@ -295,9 +295,10 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
     if get_quantization_format(module) in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
-        QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
+    elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
+        return weight_quantizer._amax.float() / 448.0
 
     # SequentialQuantizer is required
     if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled:
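
For context on the new branch: in a two-level NVFP4 scheme, each block's scale (block_amax / 6, since 6 is the FP4 e2m1 maximum) is divided by a global scale before being stored in FP8 e4m3. With the global scale set to amax / 448 as this commit does for W4A8_NVFP4_FP8, every encoded per-block scale is bounded by 448/6, the range named in the commit title, because block_amax <= amax. A minimal sketch of that bound, assuming 16-element blocks; the helper name w4a8_nvfp4_fp8_scales and the standalone scale math are illustrative, not modelopt's API:

import torch

FP8_E4M3_MAX = 448.0  # max representable magnitude in float8 e4m3
FP4_E2M1_MAX = 6.0    # max representable magnitude in NVFP4 (e2m1)

def w4a8_nvfp4_fp8_scales(weight: torch.Tensor, block_size: int = 16):
    """Hypothetical sketch of two-level NVFP4 block scaling for the w4a8 path."""
    amax = weight.abs().amax()
    # Global (second-level) scale, matching what this commit returns.
    scale_2 = amax.float() / FP8_E4M3_MAX

    blocks = weight.reshape(-1, block_size)
    block_amax = blocks.abs().amax(dim=-1).float()
    # Per-block scale maps each block's amax onto the FP4 maximum of 6.
    block_scale = block_amax / FP4_E2M1_MAX
    # FP8-stored per-block scale: since block_amax <= amax, this is
    # bounded by 448 / 6, well inside the e4m3 range.
    encoded_block_scale = block_scale / scale_2
    assert encoded_block_scale.max() <= FP8_E4M3_MAX / FP4_E2M1_MAX + 1e-3
    return scale_2, encoded_block_scale

For comparison, the pure NVFP4 path presumably keeps amax / (448 * 6) as the global scale, which lets the encoded block scales span the full e4m3 range up to 448.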
