
Commit 957ce07

vishalpandya1990 authored and kevalmorabia97 committed
Fix DQ1 output type error in DQ1->DQ2 for FP4 weights in NVFP4 model (#513)
## What does this PR do?

**Type of change:** Bug Fix

**Overview:**
- In post-processing after NVFP4 PTQ and ONNX export, we convert the FP4 QDQ pattern into DQ1->DQ2 for the FP4 weights of the MatMuls. The output of DQ1 is of the original weight type (FP16 for an FP16 base model), but its scale is in FP32. There is a cast-to-FP16 after DQ2.
- In the above setting, with FP16 base-model weights, DQ1 has x_scale in FP32 but its output type set to FP16. This hybrid-precision mode is not allowed up to opset 21, so it leads to an error when the model is run with ONNX Runtime.
- Note that such a hybrid-precision mode is allowed in opset 23+, but it is not yet fully supported by ONNX Runtime EPs, and we also want to keep supporting opset < 23 going forward.
- So this change sets the output of DQ1 to FP32, since its scale is in FP32. There is already a cast-to-FP16 after DQ2 (before the Gemm). A minimal sketch of the resulting pattern follows this description.

## Testing
- Checked with the trtexec binary and the onnxruntime-trt-rtx EP, using the sd3.5-medium model, on Windows with an RTX 5090.

## Before your PR is "*Ready for review*"
- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

---------

Signed-off-by: vipandya <[email protected]>
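For context, here is a minimal sketch (not the modelopt code path) of the DQ1 -> DQ2 -> Cast weight pattern described above after the fix, built with `onnx.helper`: both DQ outputs are typed FP32 to match the FP32 scale, and a Cast restores FP16 before the Gemm. The tensor names and the block size are illustrative placeholders, not the exact values modelopt emits.

```python
# Sketch of the post-fix FP4 weight path: DQ1 -> DQ2 -> Cast (names are illustrative).
from onnx import TensorProto, helper

# DQ1: (FP8 per-block weight scales, FP32 per-tensor scale) -> FP32 per-block scales.
dq1 = helper.make_node(
    "DequantizeLinear",
    inputs=["w_scale_f8_per_block", "w_scale_f32_per_tensor"],
    outputs=["w_scale_f32"],
    name="w_DequantizeLinear",
)

# DQ2: (FP4 weight, FP32 per-block scales) -> FP32 weight, block-quantized on the last axis.
dq2 = helper.make_node(
    "DequantizeLinear",
    inputs=["w_f4", "w_scale_f32"],
    outputs=["w_f32"],
    name="w_DequantizeLinear_1",
    axis=-1,
    block_size=16,  # NVFP4-style block size; illustrative here
)

# Cast back to the base-model precision (FP16) before the Gemm/MatMul.
cast = helper.make_node("Cast", inputs=["w_f32"], outputs=["w_f16"], to=TensorProto.FLOAT16)

# Declare the intermediate tensors as FP32: up to opset 21, a DequantizeLinear whose
# x_scale is FP32 must also produce FP32, so typing these FP16 is what ORT rejects.
value_infos = [
    helper.make_value_info("w_scale_f32", helper.make_tensor_type_proto(TensorProto.FLOAT, None)),
    helper.make_value_info("w_f32", helper.make_tensor_type_proto(TensorProto.FLOAT, None)),
]
```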
1 parent cab5e0e commit 957ce07

File tree

1 file changed: +14 −17 lines


modelopt/onnx/quantization/qdq_utils.py

Lines changed: 14 additions & 17 deletions
@@ -1317,7 +1317,6 @@ def replace_fp4qdq_with_2dq(
     w_f4: np.ndarray,
     sw_f32_per_tensor: np.ndarray,
     sw_f8_per_block: np.ndarray,
-    precision_dtype: str,
     block_size: int,
 ):
     """Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes.
@@ -1331,7 +1330,6 @@ def replace_fp4qdq_with_2dq(
         w_f4: NumPy array for w_f4.
         sw_f32_per_tensor: NumPy array for sw_f32_per_tensor.
         sw_f8_per_block: NumPy array for sw_f8_per_block.
-        precision_dtype: The precision of the weights.
         block_size: Block size used in block quantization.
     """

@@ -1391,39 +1389,39 @@ def _add_input_value_info(graph, tensor_proto):
     _add_initializer(sw_f32_per_tensor_proto)
     _add_initializer(sw_f8_per_block_proto)

-    # Create DequantizeLinear_1 node: (sw_f8_per_block, sw_f32_per_tensor) -> sw_f16
-    sw_f16_name = weight_name + "_f16_scale"
+    # Create DequantizeLinear_1 node: (sw_f8_per_block, sw_f32_per_tensor) -> sw_f32
+    sw_f32_name = weight_name + "_f32_scale"
     dequant1 = onnx.helper.make_node(
         "DequantizeLinear",
         inputs=[sw_f8_per_block_proto.name, sw_f32_per_tensor_proto.name],
-        outputs=[sw_f16_name],
+        outputs=[sw_f32_name],
         name=weight_name + "_DequantizeLinear",
     )

-    # Create DequantizeLinear_2 node: (w_f4, sw_f16) -> w_16
-    w16_name = node.output[0]
+    # Create DequantizeLinear_2 node: (w_f4, sw_f32) -> w_32
+    w32_name = node.output[0]
     dequant2 = onnx.helper.make_node(
         "DequantizeLinear",
-        inputs=[w_f4_proto.name, sw_f16_name],
-        outputs=[w16_name],
+        inputs=[w_f4_proto.name, sw_f32_name],
+        outputs=[w32_name],
         name=weight_name + "_DequantizeLinear_1",
         axis=-1,
         block_size=block_size,
     )

-    # Add value_info for sw_f16
+    # Add value_info for sw_f32
     # Assuming sw_f16 has the same shape as sw_f8_per_block
-    sw_f16_type_proto = onnx.helper.make_tensor_type_proto(
-        elem_type=onnx_dtype_map[precision_dtype], shape=sw_f8_per_block.shape
+    sw_f32_type_proto = onnx.helper.make_tensor_type_proto(
+        elem_type=onnx_dtype_map["Float"], shape=sw_f8_per_block.shape
     )
-    sw_f16_value_info = onnx.helper.make_value_info(name=sw_f16_name, type_proto=sw_f16_type_proto)
+    sw_f16_value_info = onnx.helper.make_value_info(name=sw_f32_name, type_proto=sw_f32_type_proto)
     graph.value_info.append(sw_f16_value_info)

     # Change the data type of w16 (output of 2nd DQ) to model weight precision type
-    if w16_name in value_info_map:
-        value_info_map[w16_name].type.tensor_type.elem_type = onnx_dtype_map[precision_dtype]
+    if w32_name in value_info_map:
+        value_info_map[w32_name].type.tensor_type.elem_type = onnx_dtype_map["Float"]
     else:
-        raise ValueError(f"ValueInfo for {w16_name} not found.")
+        raise ValueError(f"ValueInfo for {w32_name} not found.")

     # Add the new nodes to the graph
     graph.node.extend([dequant1, dequant2])
@@ -1522,7 +1520,6 @@ def _get_precision_dtype() -> str:
         w_f4,
         sw_f32_per_tensor,
         sw_f8_per_block,
-        precision_dtype,
         block_size,
     )
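As a follow-up, here is a small sanity check (not part of the PR) that could be run on a model exported after this change: every DQ1 scale output, which the code above names with a `_f32_scale` suffix and registers in `graph.value_info`, should be declared as FP32. The model path is a placeholder.

```python
# Verify that FP4 weight-scale DequantizeLinear outputs are typed FP32 ("model.onnx" is a placeholder).
import onnx

model = onnx.load("model.onnx")
vi_types = {vi.name: vi.type.tensor_type.elem_type for vi in model.graph.value_info}

for node in model.graph.node:
    if node.op_type == "DequantizeLinear" and node.output[0].endswith("_f32_scale"):
        elem_type = vi_types.get(node.output[0])
        assert elem_type == onnx.TensorProto.FLOAT, (
            f"{node.name}: expected FP32 output, got elem_type={elem_type}"
        )
print("All FP4 weight-scale DequantizeLinear outputs are FP32.")
```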