
Commit 6a47146

Remove export_fmt usage in other tests and readme
Signed-off-by: Keval Morabia <[email protected]>
1 parent: a355999 · commit: 6a47146

10 files changed (+10, -16 lines changed)

.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion
@@ -50,6 +50,5 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
 /examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
 /examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
 /examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
 /examples/windows @NVIDIA/modelopt-windows-codeowners

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | PyTorch                 | >=2.6                       |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) | 1.0.0rc6                    |
+| TensorRT-LLM (Optional) | 1.1.0rc2.post2              |
 +-------------------------+-----------------------------+
 | ONNX Runtime (Optional) | 1.22                        |
 +-------------------------+-----------------------------+
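
For reference, the requirements above apply to the PyPI package named in this doc (``nvidia-modelopt``). A minimal install sketch, assuming a standard pip environment; the optional TensorRT-LLM and ONNX Runtime integrations from the table are set up separately:

```bash
# Install or upgrade the core ModelOpt package; the optional components in the
# table above (TensorRT-LLM, ONNX Runtime) are not pulled in by this command.
pip install -U nvidia-modelopt
```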

examples/llm_ptq/README.md

Lines changed: 3 additions & 3 deletions
@@ -251,7 +251,7 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_
 
 > *If a GPU OOM error occurs during model quantization despite sufficient memory, setting the --use_seq_device_map flag can help. This enforces sequential device mapping, distributing the model across GPUs and utilizing up to 80% of each GPU's memory.*
 
-> *You can now add `--low_memory_mode` to the command when setting `--export_fmt=hf` to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.*
+> *You can add `--low_memory_mode` to the command to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.*
 
 #### Deepseek R1
 

@@ -301,15 +301,15 @@ with torch.inference_mode():
 ### Quantize and Export
 
 ```bash
-python hf_ptq.py --pyt_ckpt_path <huggingface_model_card> --qformat fp8 --export_fmt hf --export_path <quantized_ckpt_path> --trust_remote_code
+python hf_ptq.py --pyt_ckpt_path <huggingface_model_card> --qformat fp8 --export_path <quantized_ckpt_path> --trust_remote_code
 ```
 
 ### Hugging Face framework [Script](./scripts/huggingface_example.sh)
 
 Alternatively, the framework script `huggingface_example.sh` also supports quantize and export:
 
 ```bash
-scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
+scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8
 ```
 
 ### Deployment
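
As a usage note on the edited README text above: with `--export_fmt` retired, the Hugging Face export path is the default, so the `--low_memory_mode` flag described earlier composes directly with the basic command. A minimal sketch, keeping the README's placeholders and assuming FP8 with max calibration (the only combination the note supports):

```bash
# Sketch: FP8 PTQ with weights compressed to low precision before calibration
# to reduce peak memory; valid only for FP8/NVFP4 with max calibration.
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat fp8 \
    --low_memory_mode \
    --export_path <quantized_ckpt_path> \
    --trust_remote_code
```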

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
@@ -748,7 +748,7 @@ def output_decode(generated_ids, input_shape):
 args = parser.parse_args()
 
 if args.export_fmt != "hf":
-    warnings.warn("Deprecated. --export_fmt will be ignored.")
+    warnings.warn("Deprecated. --export_fmt forced to hf.")
 
 args.dataset = args.dataset.split(",") if args.dataset else None
 args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
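
In practice, the hunk above means an older command line that still passes `--export_fmt` keeps parsing: the script only emits the deprecation warning and proceeds with the Hugging Face export. A sketch of such a legacy invocation, using the README's placeholders and the `tensorrt_llm` value that is being retired:

```bash
# Legacy invocation: --export_fmt is still accepted, but now only triggers the
# "Deprecated. --export_fmt forced to hf." warning before a normal HF export.
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat fp8 \
    --export_fmt tensorrt_llm \
    --export_path <quantized_ckpt_path>
```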

examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb

Lines changed: 1 addition & 1 deletion
@@ -691,7 +691,7 @@
 "\n",
 "# run conversion script\n",
 "cd ..\n",
-"bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4 --export_fmt hf"
+"bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4"
 ]
 },
 {

examples/speculative_decoding/example.ipynb

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!python llm_ptq/hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-1B-Instruct --qformat fp8 --batch_size 1 --export_path /tmp/llama3.2_1B_fp8 --export_fmt hf"
+"!python llm_ptq/hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-1B-Instruct --qformat fp8 --batch_size 1 --export_path /tmp/llama3.2_1B_fp8"
 ]
 },
 {

examples/vlm_ptq/README.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct):
 
 ```bash
 git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
-scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
+scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
 ```
 
 The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md).
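
The `--tasks gqa` flag mentioned in the closing context line above combines with the quantization command in a single invocation; a sketch, picking fp8 from the supported format list:

```bash
# Sketch: quantize Qwen2.5-VL to FP8, then evaluate the result on the GQA benchmark.
git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant fp8 --tasks gqa
```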

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 0 additions & 1 deletion
@@ -89,7 +89,6 @@ def test_ptq_whisper(self, command):
         ),
         # kv_cache
         PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"),
-        #
         # autoquant_kv_cache
         PTQCommand(
             quant="nvfp4,fp8",

tests/examples/speculative_decoding/test_medusa.py

Lines changed: 2 additions & 4 deletions
@@ -29,7 +29,7 @@ def install_transformers_lt_4_50():
 
 
 # fmt: off
-def _run_hf_ptq(model_path, output_dir, qformat, export_fmt):
+def _run_hf_ptq(model_path, output_dir, qformat):
     run_example_command(
         [
             "python", "hf_ptq.py",

@@ -38,7 +38,6 @@ def _run_hf_ptq(model_path, output_dir, qformat, export_fmt):
             "--calib_size", "64",
             "--export_path", output_dir,
             "--qformat", qformat,
-            "--export_fmt", export_fmt,
         ],
         "llm_ptq",
     )

@@ -66,8 +65,7 @@ def test_llama_medusa_fp8_qat(tiny_llama_path, num_gpus, tiny_daring_anteater_pa
     )
 
     # Test PTQ on Medusa
-    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-trtllm", "fp8", "tensorrt_llm")
-    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8", "hf")
+    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8")
 
     # Test QAT on Medusa
     run_example_command(

tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py

Lines changed: 0 additions & 2 deletions
@@ -83,8 +83,6 @@ def test_unified_hf_export_and_check_safetensors(
         str(tiny_model_dir),
         "--qformat",
         qformat,
-        "--export_fmt",
-        "hf",
         "--export_path",
         str(output_dir),
     ]
