
Commit 6a47146

Remove export_fmt usage in other tests and readme
Signed-off-by: Keval Morabia <[email protected]>
1 parent: a355999 · commit: 6a47146

10 files changed (+10, -16 lines changed)

.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion
@@ -50,6 +50,5 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
 /examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
 /examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
 /examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
 /examples/windows @NVIDIA/modelopt-windows-codeowners

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | PyTorch                 | >=2.6                       |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) | 1.0.0rc6                    |
+| TensorRT-LLM (Optional) | 1.1.0rc2.post2              |
 +-------------------------+-----------------------------+
 | ONNX Runtime (Optional) | 1.22                        |
 +-------------------------+-----------------------------+
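
For reference, the requirements above apply to the PyPI package named in this doc (``nvidia-modelopt``). A minimal install sketch, assuming a standard pip environment; the optional TensorRT-LLM and ONNX Runtime integrations from the table are set up separately:

```bash
# Install or upgrade the core ModelOpt package; the optional components in the
# table above (TensorRT-LLM, ONNX Runtime) are not pulled in by this command.
pip install -U nvidia-modelopt
```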

examples/llm_ptq/README.md

Lines changed: 3 additions & 3 deletions
@@ -251,7 +251,7 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_
 
 > *If a GPU OOM error occurs during model quantization despite sufficient memory, setting the --use_seq_device_map flag can help. This enforces sequential device mapping, distributing the model across GPUs and utilizing up to 80% of each GPU's memory.*
 
-> *You can now add `--low_memory_mode` to the command when setting `--export_fmt=hf` to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.*
+> *You can add `--low_memory_mode` to the command to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.*
 
 #### Deepseek R1
 

@@ -301,15 +301,15 @@ with torch.inference_mode():
 ### Quantize and Export
 
 ```bash
-python hf_ptq.py --pyt_ckpt_path <huggingface_model_card> --qformat fp8 --export_fmt hf --export_path <quantized_ckpt_path> --trust_remote_code
+python hf_ptq.py --pyt_ckpt_path <huggingface_model_card> --qformat fp8 --export_path <quantized_ckpt_path> --trust_remote_code
 ```
 
 ### Hugging Face framework [Script](./scripts/huggingface_example.sh)
 
 Alternatively, the framework script `huggingface_example.sh` also supports quantize and export:
 
 ```bash
-scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
+scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8
 ```
 
 ### Deployment
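
As a usage note on the edited README text above: with `--export_fmt` retired, the Hugging Face export path is the default, so the `--low_memory_mode` flag described earlier composes directly with the basic command. A minimal sketch, keeping the README's placeholders and assuming FP8 with max calibration (the only combination the note supports):

```bash
# Sketch: FP8 PTQ with weights compressed to low precision before calibration
# to reduce peak memory; valid only for FP8/NVFP4 with max calibration.
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat fp8 \
    --low_memory_mode \
    --export_path <quantized_ckpt_path> \
    --trust_remote_code
```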

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
@@ -748,7 +748,7 @@ def output_decode(generated_ids, input_shape):
 args = parser.parse_args()
 
 if args.export_fmt != "hf":
-    warnings.warn("Deprecated. --export_fmt will be ignored.")
+    warnings.warn("Deprecated. --export_fmt forced to hf.")
 
 args.dataset = args.dataset.split(",") if args.dataset else None
 args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
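
In practice, the hunk above means an older command line that still passes `--export_fmt` keeps parsing: the script only emits the deprecation warning and proceeds with the Hugging Face export. A sketch of such a legacy invocation, using the README's placeholders and the `tensorrt_llm` value that is being retired:

```bash
# Legacy invocation: --export_fmt is still accepted, but now only triggers the
# "Deprecated. --export_fmt forced to hf." warning before a normal HF export.
python hf_ptq.py \
    --pyt_ckpt_path <huggingface_model_card> \
    --qformat fp8 \
    --export_fmt tensorrt_llm \
    --export_path <quantized_ckpt_path>
```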

examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb

Lines changed: 1 addition & 1 deletion
@@ -691,7 +691,7 @@
 "\n",
 "# run conversion script\n",
 "cd ..\n",
-"bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4 --export_fmt hf"
+"bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4"
 ]
 },
 {

examples/speculative_decoding/example.ipynb

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!python llm_ptq/hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-1B-Instruct --qformat fp8 --batch_size 1 --export_path /tmp/llama3.2_1B_fp8 --export_fmt hf"
+"!python llm_ptq/hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-1B-Instruct --qformat fp8 --batch_size 1 --export_path /tmp/llama3.2_1B_fp8"
 ]
 },
 {

examples/vlm_ptq/README.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct):
 
 ```bash
 git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
-scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
+scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
 ```
 
 The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md).
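
The `--tasks gqa` flag mentioned in the closing context line above combines with the quantization command in a single invocation; a sketch, picking fp8 from the supported format list:

```bash
# Sketch: quantize Qwen2.5-VL to FP8, then evaluate the result on the GQA benchmark.
git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant fp8 --tasks gqa
```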

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 0 additions & 1 deletion
@@ -89,7 +89,6 @@ def test_ptq_whisper(self, command):
         ),
         # kv_cache
         PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"),
-        #
         # autoquant_kv_cache
         PTQCommand(
             quant="nvfp4,fp8",

tests/examples/speculative_decoding/test_medusa.py

Lines changed: 2 additions & 4 deletions
@@ -29,7 +29,7 @@ def install_transformers_lt_4_50():
 
 
 # fmt: off
-def _run_hf_ptq(model_path, output_dir, qformat, export_fmt):
+def _run_hf_ptq(model_path, output_dir, qformat):
     run_example_command(
         [
             "python", "hf_ptq.py",

@@ -38,7 +38,6 @@ def _run_hf_ptq(model_path, output_dir, qformat, export_fmt):
             "--calib_size", "64",
             "--export_path", output_dir,
             "--qformat", qformat,
-            "--export_fmt", export_fmt,
         ],
         "llm_ptq",
     )

@@ -66,8 +65,7 @@ def test_llama_medusa_fp8_qat(tiny_llama_path, num_gpus, tiny_daring_anteater_pa
     )
 
     # Test PTQ on Medusa
-    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-trtllm", "fp8", "tensorrt_llm")
-    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8", "hf")
+    _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8")
 
     # Test QAT on Medusa
     run_example_command(

tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py

Lines changed: 0 additions & 2 deletions
@@ -83,8 +83,6 @@ def test_unified_hf_export_and_check_safetensors(
         str(tiny_model_dir),
         "--qformat",
         qformat,
-        "--export_fmt",
-        "hf",
         "--export_path",
         str(output_dir),
     ]
