
Commit 7074615

Fix onnx_ptq llm_export.py and support Qwen3 (#638)
## What does this PR do?

**Type of change:** Bug Fix

## Testing

- [x] Tests run locally in docker
- [x] Tests enabled in github per-PR CICD

---------

Signed-off-by: Keval Morabia <[email protected]>
1 parent 8844a2b commit 7074615

6 files changed (+52, −69 lines)

.github/workflows/example_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [diffusers]
+        example: [diffusers, onnx_ptq]
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:

examples/onnx_ptq/llm_export.py

Lines changed: 22 additions & 25 deletions
@@ -47,9 +47,9 @@ def llm_arguments():
     """Parse the arguments for the llm export script."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--torch_dir",
+        "--hf_model_path",
         type=str,
-        help="The folder of HF PyTorch model ckpt or HuggingFace model name/path (e.g., 'Qwen/Qwen2.5-0.5B-Instruct')",
+        help="The folder of HF PyTorch model ckpt or HuggingFace model name/path (e.g., 'Qwen/Qwen3-0.6B')",
         required=False,
     )
     parser.add_argument(

@@ -110,34 +110,34 @@ def llm_arguments():
 def get_config_path(args):
     """
     Get config.json file path from the arguments.
-    The default priority is: config_path > torch_dir/config.json > onnx_path/../config.json
+    The default priority is: config_path > hf_model_path/config.json > onnx_path/../config.json
     """
     if args.config_path and os.path.exists(args.config_path):
         return args.config_path
-    if args.torch_dir:
-        # Check if torch_dir is a local directory
-        if os.path.isdir(args.torch_dir):
-            torch_config = os.path.join(args.torch_dir, "config.json")
+    if args.hf_model_path:
+        # Check if hf_model_path is a local directory
+        if os.path.isdir(args.hf_model_path):
+            torch_config = os.path.join(args.hf_model_path, "config.json")
             if os.path.exists(torch_config):
                 return torch_config
         else:
             # For HuggingFace model names, download config temporarily
             try:
                 # Download config from HuggingFace
                 config = AutoConfig.from_pretrained(
-                    args.torch_dir, trust_remote_code=args.trust_remote_code
+                    args.hf_model_path, trust_remote_code=args.trust_remote_code
                 )
 
                 # Save to temporary file
                 temp_config_path = os.path.join(
-                    tempfile.gettempdir(), f"config_{args.torch_dir.replace('/', '_')}.json"
+                    tempfile.gettempdir(), f"config_{args.hf_model_path.replace('/', '_')}.json"
                 )
                 with open(temp_config_path, "w") as f:
                     json.dump(config.to_dict(), f, indent=2)
 
                 return temp_config_path
             except Exception as e:
-                print(f"Warning: Could not download config for {args.torch_dir}: {e}")
+                print(f"Warning: Could not download config for {args.hf_model_path}: {e}")
 
     if args.onnx_path:
         onnx_config = os.path.join(os.path.dirname(args.onnx_path), "config.json")

@@ -152,7 +152,7 @@ def export_raw_llm(
     output_dir,
     dtype,
     config_path,
-    torch_dir,
+    hf_model_path,
     lm_head_precision="fp16",
     dataset_dir="",
     wrapper_cls=WrapperModelForCausalLM,

@@ -167,7 +167,7 @@ def export_raw_llm(
         output_dir: str
         dtype: str
         config_path: str
-        torch_dir: str, Used for loading tokenizer for quantization
+        hf_model_path: str, Used for loading tokenizer for quantization
         dataset_dir: str, Used for quantization
         wrapper_cls: class, Used for wrapping the model
         extra_inputs: dict, Used for extra inputs

@@ -187,11 +187,11 @@ def export_raw_llm(
     # Need to quantize model to fp8, int4_awq or nvfp4
     if dtype in ["fp8", "int4_awq", "nvfp4"]:
         tokenizer = AutoTokenizer.from_pretrained(
-            torch_dir, trust_remote_code=args.trust_remote_code
+            hf_model_path, trust_remote_code=args.trust_remote_code
         )
-        # Only check for local modelopt_state if torch_dir is a local directory
-        if os.path.isdir(torch_dir):
-            modelopt_state = os.path.join(torch_dir, "modelopt_state.pth")
+        # Only check for local modelopt_state if hf_model_path is a local directory
+        if os.path.isdir(hf_model_path):
+            modelopt_state = os.path.join(hf_model_path, "modelopt_state.pth")
             model_needs_quantization = not os.path.exists(modelopt_state)
         else:
             # For HuggingFace model names, always quantize as we can't have local state files

@@ -345,8 +345,8 @@ def get_modelopt_version():
 
 def main(args):
     """Main function to export the LLM model to ONNX."""
-    assert args.torch_dir or args.onnx_path, (
-        "You need to provide either --torch_dir or --onnx_path to process the export script."
+    assert args.hf_model_path or args.onnx_path, (
+        "You need to provide either --hf_model_path or --onnx_path to process the export script."
     )
     start_time = time.time()

@@ -356,14 +356,11 @@ def main(args):
     if args.onnx_path:
         raw_onnx_path = args.onnx_path
 
-    model_loader = ModelLoader(
-        args.torch_dir,
-        args.config_path,
-    )
+    model_loader = ModelLoader(args.hf_model_path, args.config_path)
 
-    if args.torch_dir:
+    if args.hf_model_path:
         # Exporting ONNX from PyTorch model
-        model = model_loader.load_model()
+        model = model_loader.load_model(trust_remote_code=args.trust_remote_code)
         onnx_dir = args.output_dir + "_raw" if args.save_original else args.output_dir
         # Surgeon graph based on precision
         raw_onnx_path = f"{onnx_dir}/model.onnx"

@@ -373,7 +370,7 @@ def main(args):
         output_dir=onnx_dir,
         dtype=args.dtype,
         config_path=args.config_path,
-        torch_dir=args.torch_dir,
+        hf_model_path=args.hf_model_path,
         lm_head_precision=args.lm_head,
         dataset_dir=args.dataset_dir,
         wrapper_cls=WrapperModelForCausalLM,
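
For reference, the resolution order documented by the updated get_config_path (config_path, then hf_model_path/config.json, then config.json next to the ONNX file) can be sketched standalone. The SimpleNamespace below is a hypothetical stand-in for the parsed CLI arguments, not part of the patch:

from types import SimpleNamespace
import os

# Hypothetical stand-in for the argparse.Namespace produced by llm_arguments().
args = SimpleNamespace(config_path=None, hf_model_path="Qwen/Qwen3-0.6B", onnx_path=None)

if args.config_path and os.path.exists(args.config_path):
    source = "explicit --config_path"
elif args.hf_model_path and os.path.isdir(args.hf_model_path):
    source = "config.json inside the local checkpoint folder"
elif args.hf_model_path:
    source = "config downloaded from the HuggingFace Hub into a temp file"
elif args.onnx_path:
    source = "config.json next to the existing ONNX file"
else:
    source = "no config available"

print(source)  # -> "config downloaded from the HuggingFace Hub into a temp file"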

modelopt/onnx/llm_export_utils/export_utils.py

Lines changed: 15 additions & 15 deletions
@@ -21,7 +21,7 @@
 from enum import Enum
 
 import torch
-from transformers import DynamicCache
+from transformers import AutoModelForCausalLM, DynamicCache
 
 
 class RopeType(Enum):

@@ -36,10 +36,10 @@ class RopeType(Enum):
 class ModelLoader:
     """A class to handle HuggingFace model loading and configuration."""
 
-    def __init__(self, torch_dir, config_path):
+    def __init__(self, hf_model_path: str, config_path: str):
         """Initialize the ModelLoader."""
         self.config_path = config_path
-        self.torch_dir = torch_dir
+        self.hf_model_path = hf_model_path
         self.model_type = self.get_model_type()
         self.hf_model = None
         self.rope_type = RopeType.K_ROPE_ROTATE_NEOX

@@ -49,16 +49,14 @@ def get_model_type(self):
         with open(self.config_path) as f:
             return json.load(f).get("model_type")
 
-    def load_model(self):
+    def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM:
         """Load HuggingFace model based on model type."""
-        print(f"Loading HF model from {self.torch_dir} with model type {self.model_type}")
-        from transformers import AutoModelForCausalLM
-
+        print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}")
         self.hf_model = AutoModelForCausalLM.from_pretrained(
-            self.torch_dir, torch_dtype=torch.float16, trust_remote_code=True
+            self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
         )
 
-        return self.hf_model.eval().cuda()
+        return self.hf_model.eval().cuda()  # type: ignore[attr-defined]
 
     def get_rope_type(self):
         """Get rope type."""

@@ -78,13 +76,14 @@ def __init__(self, model):
         self.lm_head = model.lm_head
         self.config = model.config
 
-    def forward(
-        self,
-        input_ids,
-        past_key_values,
-    ):
+    def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple):
         """Forward pass."""
-        past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        # Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3)
+        cache = DynamicCache(config=self.config)
+        cache.key_cache = [kv[0] for kv in past_key_values]
+        cache.value_cache = [kv[1] for kv in past_key_values]
+        past_key_values = cache
+
         outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
         hidden_states = outputs[0]
         past_key_values = outputs.past_key_values.to_legacy_cache()

@@ -159,4 +158,5 @@ def torch_to_onnx(model, inputs, onnx_dir, onnx_name, input_names, output_names,
         dynamic_axes=dynamic_axes,
         opset_version=19,
         do_constant_folding=True,
+        dynamo=False,
     )
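
The forward-pass change above (building a DynamicCache by hand instead of calling DynamicCache.from_legacy_cache) is the piece that lets Qwen3 run under the export wrapper. A minimal standalone sketch of the same conversion on dummy tensors follows; the shapes are arbitrary, the no-argument DynamicCache() constructor is a simplification (the patch passes config=self.config), and DynamicCache internals can differ between transformers releases:

import torch
from transformers import DynamicCache

# Arbitrary toy dimensions for the sketch.
num_layers, batch, heads, seq, head_dim = 2, 1, 4, 8, 16

# Legacy cache layout: one (key, value) tensor pair per decoder layer.
legacy_cache = tuple(
    (torch.zeros(batch, heads, seq, head_dim), torch.zeros(batch, heads, seq, head_dim))
    for _ in range(num_layers)
)

# Mirror the patch: populate the cache lists directly from the legacy tuple.
cache = DynamicCache()
cache.key_cache = [kv[0] for kv in legacy_cache]
cache.value_cache = [kv[1] for kv in legacy_cache]

print(len(cache.key_cache), cache.key_cache[0].shape)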

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ disable_error_code = ["attr-defined"]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
 # print execution time for 20 slowest tests and generate coverage reports
-addopts = "-ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
 markers = [
     "manual: Only run when --run-manual is given",

tests/_test_utils/examples/run_command.py

Lines changed: 0 additions & 16 deletions
@@ -62,22 +62,6 @@ def run_command_in_background(
     return process
 
 
-def run_onnx_llm_export_command(
-    *, torch_dir: str, dtype: str, lm_head: str, output_dir: str, calib_size: str, **kwargs
-):
-    kwargs.update(
-        {
-            "torch_dir": torch_dir,
-            "dtype": dtype,
-            "lm_head": lm_head,
-            "output_dir": output_dir,
-            "calib_size": calib_size,
-        }
-    )
-    cmd_parts = extend_cmd_parts(["python", "llm_export.py"], **kwargs)
-    run_example_command(cmd_parts, "onnx_ptq")
-
-
 def run_llm_ptq_command(*, model: str, quant: str, **kwargs):
     kwargs.update({"model": model, "quant": quant})
     kwargs.setdefault("tasks", "quant")

tests/examples/onnx_ptq/test_llm_export.py

Lines changed: 13 additions & 11 deletions
@@ -15,23 +15,25 @@
 
 
 import pytest
-from _test_utils.examples.run_command import run_onnx_llm_export_command
+from _test_utils.examples.run_command import extend_cmd_parts, run_example_command
 
 
 @pytest.mark.parametrize(
-    ("torch_dir", "dtype", "lm_head", "output_dir", "calib_size"),
+    ("hf_model_path", "dtype", "lm_head"),
     [
-        ("Qwen/Qwen2-0.5B-Instruct", "fp16", "fp16", "/tmp/qwen2-0.5b-instruct-fp16", "1"),
-        ("Qwen/Qwen2-0.5B-Instruct", "fp8", "fp16", "/tmp/qwen2-0.5b-instruct-fp8", "1"),
-        ("Qwen/Qwen2-0.5B-Instruct", "int4_awq", "fp16", "/tmp/qwen2-0.5b-instruct-int4_awq", "1"),
-        ("Qwen/Qwen2-0.5B-Instruct", "nvfp4", "fp16", "/tmp/qwen2-0.5b-instruct-nvfp4", "1"),
+        ("Qwen/Qwen2-0.5B-Instruct", "fp16", "fp16"),
+        ("Qwen/Qwen2-0.5B-Instruct", "fp8", "fp16"),
+        ("Qwen/Qwen3-0.6B", "int4_awq", "fp16"),
+        ("Qwen/Qwen3-0.6B", "nvfp4", "fp16"),
     ],
 )
-def test_llm_export_onnx(torch_dir, dtype, lm_head, output_dir, calib_size):
-    run_onnx_llm_export_command(
-        torch_dir=torch_dir,
+def test_llm_export_onnx(tmp_path, hf_model_path, dtype, lm_head):
+    cmd_parts = extend_cmd_parts(
+        ["python", "llm_export.py"],
+        hf_model_path=hf_model_path,
         dtype=dtype,
         lm_head=lm_head,
-        output_dir=output_dir,
-        calib_size=calib_size,
+        output_dir=str(tmp_path),
+        calib_size=1,
     )
+    run_example_command(cmd_parts, "onnx_ptq")
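
The test now builds the command inline instead of going through the deleted run_onnx_llm_export_command helper. extend_cmd_parts itself lives in _test_utils and is not shown in this diff; assuming it appends each keyword argument as a "--key value" pair, the nvfp4 case would expand roughly as below (the helper here is a hypothetical stand-in, not the real implementation):

def extend_cmd_parts_sketch(cmd_parts: list[str], **kwargs) -> list[str]:
    # Hypothetical stand-in for _test_utils.examples.run_command.extend_cmd_parts:
    # append each keyword argument as a "--key value" pair.
    for key, value in kwargs.items():
        cmd_parts += [f"--{key}", str(value)]
    return cmd_parts

print(" ".join(extend_cmd_parts_sketch(
    ["python", "llm_export.py"],
    hf_model_path="Qwen/Qwen3-0.6B",
    dtype="nvfp4",
    lm_head="fp16",
    output_dir="/tmp/test-output",  # the real test passes pytest's tmp_path
    calib_size=1,
)))
# -> python llm_export.py --hf_model_path Qwen/Qwen3-0.6B --dtype nvfp4 --lm_head fp16 --output_dir /tmp/test-output --calib_size 1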
