Commit 02fb06d
[NVBUG_5703882] Add INT4QuantExporter to llm_export.py (NVIDIA#631)
## What does this PR do?

**Type of change:** Bug Fix

**Overview:**
- Added `INT4QuantExporter` to the `llm_export.py` example
- Added an E2E integration test for `llm_export.py`

## Testing

```
python llm_export.py --torch_dir=Qwen/Qwen2-0.5B-Instruct --dtype=fp8 --lm_head=fp16 --output_dir=./qwen2-0.5B-Instruct --calib_size=64 --trust_remote_code
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: Yes
- **Did you add or update any necessary documentation?**: No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

---------

Signed-off-by: ajrasane <[email protected]>
1 parent 255eb1a commit 02fb06d

4 files changed (+47 −4 lines)

examples/onnx_ptq/llm_export.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -30,14 +30,15 @@
 from transformers import AutoConfig, AutoTokenizer

 import modelopt
+from modelopt.onnx.export import INT4QuantExporter
 from modelopt.onnx.llm_export_utils.export_utils import (
     ModelLoader,
     WrapperModelForCausalLM,
     llm_to_onnx,
 )
 from modelopt.onnx.llm_export_utils.quantization_utils import quantize
 from modelopt.onnx.llm_export_utils.surgeon_utils import fold_fp8_qdq_to_dq
-from modelopt.onnx.quantization.qdq_utils import fp4qdq_to_2dq, quantize_weights_to_int4
+from modelopt.onnx.quantization.qdq_utils import fp4qdq_to_2dq
 from modelopt.torch.export import export_hf_checkpoint
 from modelopt.torch.quantization.utils import is_quantized_linear

@@ -278,7 +279,7 @@ def time_operation(operation_name):

     elif dtype == "int4_awq":
         with time_operation("quantizing weights to int4"):
-            onnx_model = quantize_weights_to_int4(onnx_model)
+            onnx_model = INT4QuantExporter.process_model(onnx_model)

     output_onnx_name = f"{output_dir}/model.onnx"
     print(
```

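The key change here swaps the free function `quantize_weights_to_int4` for the exporter entry point `INT4QuantExporter.process_model`. A minimal sketch of the new call path, assuming an ONNX model with INT4-AWQ Q/DQ nodes already saved to disk (the file paths are hypothetical; `llm_export.py` itself holds the model in memory):

```python
import onnx

from modelopt.onnx.export import INT4QuantExporter

# Hypothetical path: in llm_export.py the model arrives here in memory.
onnx_model = onnx.load("qwen2-0.5B-Instruct/model.onnx")

# Replaces the former quantize_weights_to_int4(onnx_model) call: the exporter
# class post-processes the graph so weights are stored in INT4 form.
onnx_model = INT4QuantExporter.process_model(onnx_model)

onnx.save(onnx_model, "qwen2-0.5B-Instruct/model_int4.onnx")
```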
modelopt/onnx/export/int4_exporter.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -35,7 +35,7 @@ def pre_process(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
     graph = onnx_model.graph
     value_info_map = {value_info.name: value_info for value_info in graph.value_info}
     weight_dq_nodes = [node for node in graph.node if node.op_type == "DequantizeLinear"]
-    tensor_producer_map = get_tensor_producer_nodes(graph)
+    tensor_producer_map = get_tensor_producer_nodes(graph, get_initializer_producers=True)

     nodes_to_remove = []
     for node in weight_dq_nodes:
@@ -126,7 +126,7 @@ def compute_scales(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
     graph = onnx_model.graph
     initializer_map = {initializer.name: initializer for initializer in graph.initializer}
     weight_dq_nodes = [node for node in graph.node if node.op_type == "DequantizeLinear"]
-    tensor_producer_map = get_tensor_producer_nodes(graph)
+    tensor_producer_map = get_tensor_producer_nodes(graph, get_initializer_producers=True)

     for node in weight_dq_nodes:
         weight_name = node.input[0]
```

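Both `pre_process` and `compute_scales` walk the `DequantizeLinear` nodes and look up the producer of each node's weight input. Quantized weights are usually stored as graph initializers, which have no producing node, so both lookups now pass the new `get_initializer_producers=True` flag. A small illustration of the difference, assuming a quantized model file (path hypothetical):

```python
import onnx

from modelopt.onnx.quantization.graph_utils import get_tensor_producer_nodes

graph = onnx.load("model_int4.onnx").graph  # hypothetical path

# Default: only tensors produced by nodes appear in the map, so a weight
# initializer feeding a DequantizeLinear node has no entry.
producers = get_tensor_producer_nodes(graph)

# New flag: initializer names now map to their TensorProto objects, letting
# the INT4 exporter resolve DequantizeLinear weight inputs uniformly.
producers = get_tensor_producer_nodes(graph, get_initializer_producers=True)
```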
modelopt/onnx/quantization/graph_utils.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -236,6 +236,7 @@ def get_tensor_from_name(graph: onnx.GraphProto, tensor_name: str) -> onnx.Value

 def get_tensor_producer_nodes(
     graph: onnx.GraphProto,
+    get_initializer_producers: bool = False,
 ) -> dict[str, onnx.NodeProto]:
     """Returns a dictionary of tensor name and their producer node object mapping.

@@ -272,6 +273,10 @@
         for output_name in node.output:
             tensor_producers[output_name] = node

+    if get_initializer_producers:
+        for initializer in graph.initializer:
+            tensor_producers[initializer.name] = initializer
+
     return tensor_producers

```

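One caveat worth noting: with the flag enabled, the returned dictionary mixes value types. Node outputs still map to `onnx.NodeProto`, while initializer names map to `onnx.TensorProto` (the declared return annotation is unchanged). Callers that inspect a producer should therefore check its type first, e.g. (continuing the sketch above; the tensor name is hypothetical):

```python
import onnx
from onnx import numpy_helper

# "producers" comes from the get_tensor_producer_nodes sketch above.
producer = producers["model.layers.0.mlp.up_proj.weight"]

if isinstance(producer, onnx.TensorProto):
    # Initializer entry: the weight data can be read directly.
    weight = numpy_helper.to_array(producer)
else:
    # Node entry: inspect the producing op instead.
    print(producer.op_type)
```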
New file: E2E integration test for llm_export.py

Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from _test_utils.examples.run_command import run_onnx_llm_export_command
+
+
+@pytest.mark.parametrize(
+    ("torch_dir", "dtype", "lm_head", "output_dir", "calib_size"),
+    [
+        ("Qwen/Qwen2-0.5B-Instruct", "fp16", "fp16", "/tmp/qwen2-0.5b-instruct-fp16", "1"),
+        ("Qwen/Qwen2-0.5B-Instruct", "fp8", "fp16", "/tmp/qwen2-0.5b-instruct-fp8", "1"),
+        ("Qwen/Qwen2-0.5B-Instruct", "int4_awq", "fp16", "/tmp/qwen2-0.5b-instruct-int4_awq", "1"),
+        ("Qwen/Qwen2-0.5B-Instruct", "nvfp4", "fp16", "/tmp/qwen2-0.5b-instruct-nvfp4", "1"),
+    ],
+)
+def test_llm_export_onnx(torch_dir, dtype, lm_head, output_dir, calib_size):
+    run_onnx_llm_export_command(
+        torch_dir=torch_dir,
+        dtype=dtype,
+        lm_head=lm_head,
+        output_dir=output_dir,
+        calib_size=calib_size,
+    )
```
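The four cases mirror the dtype branches in `llm_export.py` (`fp16`, `fp8`, `int4_awq`, `nvfp4`), each with `calib_size=1`. A single case can also be driven directly through the same test utility, assuming the repository's test helpers are importable (a sketch, not part of the commit):

```python
from _test_utils.examples.run_command import run_onnx_llm_export_command

# Same arguments as the int4_awq parametrization above; the helper is assumed
# to invoke examples/onnx_ptq/llm_export.py end to end.
run_onnx_llm_export_command(
    torch_dir="Qwen/Qwen2-0.5B-Instruct",
    dtype="int4_awq",
    lm_head="fp16",
    output_dir="/tmp/qwen2-0.5b-instruct-int4_awq",
    calib_size="1",
)
```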
