[NVBUG: 5373030] Disable the weight adjustment for int32 bias from onnxruntime (NVIDIA#510)

ajrasane · web-flow · commit fcbdc315aa07 · 2025-11-05T11:19:50.000-08:00
## What does this PR do?

**Type of change:** Bug Fix

**Overview:**

- Disable the weight adjustment for int32 bias in onnxruntime by default

## Usage

```bash
python -m modelopt.onnx.quantization --onnx_path=code031_gemm_batch.onnx --simplify --calibration_eps trt --quantize_mode fp8 --disable_mha_qdq --high_precision_dtype fp16
```

## Testing

Able to quantize the code031_gemm_batch.onnx model

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: Yes
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

---------

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py
@@ -272,6 +272,10 @@ def quantize(
                 trt_guided_options["group_qdq_tensors"] = group_qdq_tensors
                 logger.debug(f"Grouping QDQ tensors for concat elimination: {group_qdq_tensors}")
 
+        # Add disable_int32_weight_adjustment flag to extra options
+        trt_guided_options["QDQDisableWeightAdjustForInt32Bias"] = True
+        logger.debug("Disabled weight adjustment for INT32 bias in QDQ quantization")
+
         # Create a temp file for intermediate model
         tmp_onnx_file, tmp_onnx_path = tempfile.mkstemp(suffix=".onnx")
         os.close(tmp_onnx_file)
diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py
@@ -237,6 +237,10 @@ def quantize(
                 trt_guided_options["group_qdq_tensors"] = group_qdq_tensors
                 logger.debug(f"Found {len(group_qdq_tensors)} tensor groups for concat elimination")
 
+        # Add disable_int32_weight_adjustment flag to extra options
+        trt_guided_options["QDQDisableWeightAdjustForInt32Bias"] = True
+        logger.debug("Disabled weight adjustment for INT32 bias in QDQ quantization")
+
         # Create a temp file for intermediate model
         tmp_onnx_file, tmp_onnx_path = tempfile.mkstemp(suffix=".onnx")
         os.close(tmp_onnx_file)
diff --git a/modelopt/onnx/quantization/ort_patching.py b/modelopt/onnx/quantization/ort_patching.py
@@ -1600,6 +1600,7 @@ def _quantize_static(
         ("TrtExtraPluginLibraryPaths", "trt_extra_plugin_lib_paths"),
         ("ExecutionProviders", "execution_providers"),
         ("group_qdq_tensors", "group_qdq_tensors"),
+        ("QDQDisableWeightAdjustForInt32Bias", "disable_int32_weight_adjustment"),
         # ==========================================================
     ]
     calib_extra_options = {

Original file line number	Diff line number	Diff line change
`@@ -1600,6 +1600,7 @@ def _quantize_static(`
`1600`	`1600`	`("TrtExtraPluginLibraryPaths", "trt_extra_plugin_lib_paths"),`
`1601`	`1601`	`("ExecutionProviders", "execution_providers"),`
`1602`	`1602`	`("group_qdq_tensors", "group_qdq_tensors"),`
	`1603`	`+ ("QDQDisableWeightAdjustForInt32Bias", "disable_int32_weight_adjustment"),`
`1603`	`1604`	`# ==========================================================`
`1604`	`1605`	`]`
`1605`	`1606`	`calib_extra_options = {`